"""
Evaluate the current predictor on a separate local field-test dataset.

Expected folder layout:
    data/local_test/
        plastic/
        paper/
        organic/
        metal/
        glass/
        unknown/   optional
        residu/    optional alias for unknown

Usage:
    python scripts/evaluate_local.py --data_dir data/local_test --output_dir reports/local_eval
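
Optional environment overrides (defaults shown, read at startup below):
    AUTO_APPROVE_THRESHOLD=0.85
    AMBIGUITY_GAP_THRESHOLD=0.15
    UNKNOWN_ON_AMBIGUOUS=true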
"""

import argparse
import csv
import json
import os
import sys
from pathlib import Path

# Resolve the repo root so the script works from any working directory, and
# point matplotlib at a writable cache directory *before* it is imported below.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
MPL_CONFIG_DIR = os.path.join(PROJECT_ROOT, ".cache", "matplotlib")
os.makedirs(MPL_CONFIG_DIR, exist_ok=True)
os.environ.setdefault("MPLCONFIGDIR", MPL_CONFIG_DIR)
sys.path.insert(0, PROJECT_ROOT)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

from app.predictor import CLASS_NAMES, WastePredictor

# Coerce to a list so the concatenation below works even if CLASS_NAMES is a tuple.
KNOWN_CLASSES = list(CLASS_NAMES)
UNKNOWN_ALIASES = {"unknown", "residu", "residue", "other", "lainnya"}
EVAL_LABELS = KNOWN_CLASSES + ["unknown"]
INPUT_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
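
# Decision gates for this offline evaluation. Assumption: these env-var
# defaults are meant to mirror the auto-approve gating the service applies at
# inference time; override them if your deployment uses different thresholds.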
AUTO_APPROVE_THRESHOLD = float(os.getenv("AUTO_APPROVE_THRESHOLD", "0.85"))
AMBIGUITY_GAP_THRESHOLD = float(os.getenv("AMBIGUITY_GAP_THRESHOLD", "0.15"))
UNKNOWN_ON_AMBIGUOUS = os.getenv("UNKNOWN_ON_AMBIGUOUS", "true").lower() == "true"


def normalize_label(label: str) -> str:
    """Lower-case a folder/class name and map unknown aliases to "unknown"."""
    normalized = label.strip().lower()
    if normalized in UNKNOWN_ALIASES:
        return "unknown"
    return normalized


def collect_images(data_dir: Path) -> list[tuple[Path, str]]:
    """Walk one subfolder per class and return (image_path, normalized_label) pairs."""
    rows = []
    for class_dir in sorted(data_dir.iterdir()):
        if not class_dir.is_dir():
            continue

        label = normalize_label(class_dir.name)
        if label not in EVAL_LABELS:
            print(f"Skipping unsupported folder: {class_dir}")
            continue

        for image_path in sorted(class_dir.rglob("*")):
            if image_path.is_file() and image_path.suffix.lower() in INPUT_EXTS:
                rows.append((image_path, label))

    if not rows:
        raise ValueError(f"No evaluation images found under {data_dir}")

    return rows


def choose_decision_class(result: dict) -> str:
    """Gate the raw prediction: return the top class only when it is confident.

    Falls back to "unknown" when the top score is below AUTO_APPROVE_THRESHOLD,
    or when the gap between the top two scores is below AMBIGUITY_GAP_THRESHOLD
    and UNKNOWN_ON_AMBIGUOUS is enabled.
    """
    ranked_scores = sorted(result["all_scores"].items(), key=lambda item: item[1], reverse=True)
    top_class, top_score = ranked_scores[0]
    second_score = ranked_scores[1][1] if len(ranked_scores) > 1 else top_score
    confidence_gap = float(top_score) - float(second_score)

    if float(result["confidence"]) < AUTO_APPROVE_THRESHOLD:
        return "unknown"
    if UNKNOWN_ON_AMBIGUOUS and confidence_gap < AMBIGUITY_GAP_THRESHOLD:
        return "unknown"
    return top_class
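# Worked example (hypothetical scores): with the default thresholds, a result
# of {"plastic": 0.90, "paper": 0.80, ...} clears AUTO_APPROVE_THRESHOLD
# (0.90 >= 0.85), but the 0.10 top-two gap is below AMBIGUITY_GAP_THRESHOLD
# (0.15), so choose_decision_class() returns "unknown".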


def save_confusion_matrix(y_true: list[str], y_pred: list[str], output_path: Path) -> None:
    """Render a row-normalized confusion matrix and save it to output_path."""
    cm = confusion_matrix(y_true, y_pred, labels=EVAL_LABELS, normalize="true")
    # Labels absent from y_true produce all-NaN rows under normalize="true";
    # zero them so the cell annotations below render cleanly.
    cm = np.nan_to_num(cm)
    fig, ax = plt.subplots(figsize=(8, 7))
    image = ax.imshow(cm, cmap="Blues", vmin=0, vmax=1)
    ax.set_xticks(range(len(EVAL_LABELS)))
    ax.set_yticks(range(len(EVAL_LABELS)))
    ax.set_xticklabels(EVAL_LABELS, rotation=45, ha="right")
    ax.set_yticklabels(EVAL_LABELS)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Local Field-Test Confusion Matrix")
    plt.colorbar(image, ax=ax)

    for row in range(len(EVAL_LABELS)):
        for col in range(len(EVAL_LABELS)):
            value = cm[row, col]
            ax.text(
                col,
                row,
                f"{value:.2f}",
                ha="center",
                va="center",
                fontsize=8,
                color="white" if value > 0.5 else "black",
            )

    plt.tight_layout()
    fig.savefig(output_path, dpi=150)
    plt.close(fig)


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate model on a local field-test dataset.")
    parser.add_argument("--data_dir", default="data/local_test")
    parser.add_argument("--output_dir", default="reports/local_eval")
    args = parser.parse_args()

    data_dir = Path(args.data_dir)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    predictor = WastePredictor()
    predictor.load()

    samples = collect_images(data_dir)
    predictions = []
    y_true = []
    y_pred = []

    for image_path, actual_label in samples:
        result = predictor.predict(image_path.read_bytes())
        predicted_label = normalize_label(choose_decision_class(result))

        y_true.append(actual_label)
        y_pred.append(predicted_label)
        predictions.append(
            {
                "image_path": str(image_path),
                "actual": actual_label,
                "predicted": predicted_label,
                "model_class": result["class"],
                "confidence": result["confidence"],
                "raw_class": result.get("raw_class"),
                "raw_confidence": result.get("raw_confidence"),
                "all_scores": result["all_scores"],
                "correct": actual_label == predicted_label,
            }
        )

    report = classification_report(
        y_true,
        y_pred,
        labels=EVAL_LABELS,
        output_dict=True,
        zero_division=0,
    )
    readable_report = classification_report(
        y_true,
        y_pred,
        labels=EVAL_LABELS,
        zero_division=0,
    )

    (output_dir / "metrics.json").write_text(json.dumps(report, indent=2), encoding="utf-8")
    (output_dir / "classification_report.txt").write_text(readable_report, encoding="utf-8")
    save_confusion_matrix(y_true, y_pred, output_dir / "confusion_matrix.png")

    with (output_dir / "predictions.csv").open("w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(
            file,
            fieldnames=[
                "image_path",
                "actual",
                "predicted",
                "model_class",
                "confidence",
                "raw_class",
                "raw_confidence",
                "correct",
            ],
        )
        writer.writeheader()
        for row in predictions:
            writer.writerow({key: row.get(key) for key in writer.fieldnames})

    mistakes = [row for row in predictions if not row["correct"]]
    (output_dir / "mistakes.json").write_text(json.dumps(mistakes, indent=2), encoding="utf-8")

    print(readable_report)
    print(f"\nEvaluated images : {len(samples)}")
    print(f"Mistakes         : {len(mistakes)}")
    print(f"Output dir       : {output_dir.resolve()}")


if __name__ == "__main__":
    main()