File size: 6,115 Bytes
1314bf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile
from typing import Tuple, Optional
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


def create_confusion_matrix_plot(
    cm: np.ndarray, 
    accuracy: float, 
    labels: Optional[list] = None
) -> str:
    """
    Create a confusion matrix heatmap and save it to a temporary PNG file.
    
    Args:
        cm: Confusion matrix array (rows = ground truth, columns = prediction)
        accuracy: Accuracy score, rendered as a percentage in the title
        labels: Tick labels for both axes; defaults to ['No', 'Yes']
        
    Returns:
        Path to the saved plot file
    """
    # None-default instead of a mutable list default (shared-state pitfall).
    if labels is None:
        labels = ['No', 'Yes']
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(f'Confusion Matrix (Accuracy: {accuracy:.1%})')
    plt.ylabel('Ground Truth')
    plt.xlabel('Model Prediction')
    
    # NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp:
    # the file is created atomically; delete=False keeps it for the caller.
    with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
        temp_file = tmp.name
    plt.savefig(temp_file, dpi=150, bbox_inches='tight')
    plt.close()
    
    return temp_file


def _find_original_case(df: pd.DataFrame, lowered: str) -> Optional[str]:
    """Return the original-case spelling of *lowered* as it appears in the
    'Ground Truth' or 'Binary Output' columns, or None if not found."""
    for column in ('Ground Truth', 'Binary Output'):
        for val in df[column].dropna():
            if str(val).lower() == lowered:
                return str(val)
    return None


def create_accuracy_table(df: pd.DataFrame) -> Tuple[pd.DataFrame, str, pd.DataFrame]:
    """
    Create accuracy metrics table and confusion matrix from results dataframe.
    
    Args:
        df: DataFrame with 'Ground Truth' and 'Binary Output' columns
        
    Returns:
        Tuple of (metrics_df, confusion_matrix_plot_path, confusion_matrix_values_df)
        
    Raises:
        ValueError: If insufficient data for binary classification
    """
    df_copy = df.copy()
    
    # Collect the distinct (lower-cased) values appearing in either column.
    # Convert to string first so .str operations work on mixed dtypes.
    ground_truth_values = df_copy['Ground Truth'].dropna().astype(str).str.lower().unique()
    binary_output_values = df_copy['Binary Output'].dropna().astype(str).str.lower().unique()
    
    all_values = set(ground_truth_values) | set(binary_output_values)
    all_values = [v for v in all_values if v.strip()]  # Remove empty strings
    
    if len(all_values) < 2:
        raise ValueError("Need at least 2 different values for binary classification")
    
    # Alphabetical order gives a deterministic mapping regardless of row
    # order: the first value becomes the negative class (0); every other
    # value is treated as the positive class (1).
    sorted_values = sorted(all_values)
    value_mapping = {v: (0 if i == 0 else 1) for i, v in enumerate(sorted_values)}
    
    print(f"Detected binary mapping: {value_mapping}")
    
    # Apply the mapping; values absent from the mapping (e.g. the string
    # 'nan' that astype(str) produces for missing data) map to NaN and are
    # dropped below.
    df_copy['Ground Truth Binary'] = df_copy['Ground Truth'].astype(str).str.lower().map(value_mapping)
    df_copy['Binary Output Binary'] = df_copy['Binary Output'].astype(str).str.lower().map(value_mapping)
    df_copy = df_copy.dropna(subset=['Ground Truth Binary', 'Binary Output Binary'])
    
    if len(df_copy) == 0:
        raise ValueError("No valid data for accuracy calculation after mapping. Check that Ground Truth and Binary Output contain valid binary values.")
    
    y_true = df_copy['Ground Truth Binary']
    y_pred = df_copy['Binary Output Binary']
    
    # labels=[0, 1] forces a 2x2 matrix even when only one class survives
    # the mapping, so the 2-label plot and the TN/FP/FN/TP unpacking below
    # are always valid.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    # Create metrics dataframe
    metrics_data = [
        ["Accuracy", f"{accuracy:.3f}"],
        ["Precision", f"{precision:.3f}"],
        ["Recall", f"{recall:.3f}"],
        ["F1 Score", f"{f1:.3f}"],
        ["Total Samples", f"{len(df_copy)}"]
    ]
    metrics_df = pd.DataFrame(metrics_data, columns=["Metric", "Value"])
    
    # Exactly one display label per class: the alphabetically-first value
    # for class 0, and its successor as a representative of class 1.
    # (Previously every raw value produced its own label, so >2 raw values
    # yielded >2 labels for a 2x2 matrix and broke the plot and table.)
    original_labels = []
    for representative in sorted_values[:2]:
        original_case = _find_original_case(df_copy, representative)
        original_labels.append(original_case if original_case else representative.title())
    
    cm_plot_path = create_confusion_matrix_plot(cm, accuracy, original_labels)
    
    # Confusion matrix values table (cm is guaranteed 2x2 — see above)
    tn, fp, fn, tp = cm.ravel()
    cm_values = pd.DataFrame(
        [[tn, fp], [fn, tp]],
        columns=[f"Predicted {original_labels[0]}", f"Predicted {original_labels[1]}"],
        index=[f"Actual {original_labels[0]}", f"Actual {original_labels[1]}"]
    )
    
    return metrics_df, cm_plot_path, cm_values


def save_dataframe_to_csv(df: pd.DataFrame) -> Optional[str]:
    """
    Save dataframe to a temporary CSV file.
    
    Args:
        df: DataFrame to save
        
    Returns:
        Path to saved CSV file, or None if the dataframe is None or empty
    """
    if df is None or df.empty:
        return None
        
    # NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp:
    # the file is created atomically; delete=False keeps it for the caller.
    with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as tmp:
        temp_file = tmp.name
    df.to_csv(temp_file, index=False)
    return temp_file