File size: 2,959 Bytes
91e7690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab076fb
 
91e7690
 
3e987ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from tasks.base import BaseTask
from env.models import AuditReport


class Task3(BaseTask):
    """Drift-detection task comparing a baseline and a current transactions table.

    Grading inspects three entries of ``report.drift_details`` -- ``"amount"``,
    ``"category"`` and ``"user_id"`` -- and combines the resulting sub-scores
    with fixed weights.

    ``brier_adjust`` and ``strict_score`` are inherited from ``BaseTask`` and
    are not defined here; presumably they calibrate a score against the stated
    confidence and clamp it to the valid range -- confirm against the base class.
    """

    def get_description(self) -> str:
        """Return the natural-language prompt describing the task."""
        return (
            "Compare 'transactions_baseline' (last month) with 'transactions_current' (this month). "
            "Detect silent data drift: mean/distribution shifts in numeric columns, new category "
            "values not present in baseline, and referential drift (new user_ids not in baseline). "
            "Nothing is explicitly labelled wrong — you must find it statistically."
        )

    def get_table_names(self) -> list[str]:
        """Return the names of the two tables this task compares."""
        return ["transactions_baseline", "transactions_current"]

    def grade(self, report: AuditReport, gold: dict) -> tuple[float, dict]:
        """Score an audit report against the gold answers.

        Args:
            report: The submitted report. ``drift_details`` is read with
                ``.get`` for the keys ``"amount"``, ``"category"`` and
                ``"user_id"``; each present entry must expose ``value`` and
                ``confidence``. ``recommended_fixes`` is only stringified.
            gold: Reference answers. ``"new_categories"`` is read when a
                category entry is present, ``"referential_drift_pct"`` when a
                user_id entry is present.

        Returns:
            ``(total, scores)`` where ``scores`` maps ``"mean_shift"``,
            ``"new_cats"`` and ``"ref_drift"`` to their sub-scores (each passed
            through ``strict_score``) and ``total`` is their weighted sum
            (0.40 / 0.35 / 0.25), rounded to 4 places and passed through
            ``strict_score`` again.
        """
        # Treat a missing drift_details as "nothing reported" instead of crashing.
        details = report.drift_details or {}

        raw_scores: dict[str, float] = {
            "mean_shift": self._grade_mean_shift(details.get("amount")),
            "new_cats": self._grade_new_categories(report, details.get("category"), gold),
            "ref_drift": self._grade_ref_drift(details.get("user_id"), gold),
        }
        scores = {name: self.strict_score(value) for name, value in raw_scores.items()}

        weights = {"mean_shift": 0.40, "new_cats": 0.35, "ref_drift": 0.25}
        total = sum(scores[name] * weight for name, weight in weights.items())
        return self.strict_score(round(total, 4)), scores

    def _grade_mean_shift(self, amount_drift) -> float:
        """Score the ``"amount"`` entry: credit if its value mentions a shift or mean."""
        if not amount_drift:
            return 0.0
        text = str(amount_drift.value).lower()
        detected = "shift" in text or "mean" in text
        return self.brier_adjust(1.0 if detected else 0.0, amount_drift.confidence, detected)

    def _grade_new_categories(self, report: AuditReport, cat_drift, gold: dict) -> float:
        """Score the ``"category"`` entry by F1 against ``gold["new_categories"]``.

        Without a category entry, partial credit (0.3) is given when the
        stringified drift details or recommended fixes mention a category
        keyword anywhere.
        """
        if not cat_drift:
            keywords = ("categor", "crypto", "nft")
            haystacks = [
                str(section).lower()
                for section in (report.drift_details, report.recommended_fixes)
            ]
            mentioned = any(keyword in text for text in haystacks for keyword in keywords)
            return 0.3 if mentioned else 0.0

        reported = self._normalize_categories(cat_drift.value)
        actual = self._normalize_categories(gold["new_categories"])
        hits = len(reported & actual)
        precision = hits / max(len(reported), 1)
        recall = hits / max(len(actual), 1)
        # The 1e-6 floor avoids division by zero when there is no overlap.
        f1 = 2 * precision * recall / max(precision + recall, 1e-6)
        return self.brier_adjust(f1, cat_drift.confidence, f1 > 0.4)

    def _grade_ref_drift(self, ref_drift, gold: dict) -> float:
        """Score the ``"user_id"`` entry against ``gold["referential_drift_pct"]``.

        Full credit when the reported fraction is within 0.05 of the gold
        value, half credit otherwise, and a flat 0.2 when the reported value
        (or the gold value) cannot be read as a number.
        """
        if not ref_drift:
            return 0.0
        try:
            reported_pct = self._parse_fraction(ref_drift.value)
            actual_pct = float(gold["referential_drift_pct"])
        except (ValueError, IndexError, KeyError, TypeError):
            # Best effort: an entry was supplied but is not comparable.
            return 0.2
        within_5pct = abs(reported_pct - actual_pct) <= 0.05
        return self.brier_adjust(1.0 if within_5pct else 0.5, ref_drift.confidence, within_5pct)

    @staticmethod
    def _normalize_categories(value: object) -> set[str]:
        """Return the category names in *value* as a case-folded set.

        Accepts a list/tuple/set of names or a comma-separated string;
        surrounding whitespace, brackets and quotes are stripped so that a
        stringified list such as ``"['crypto', 'nft']"`` also parses.
        """
        if isinstance(value, (list, tuple, set, frozenset)):
            items = [str(item) for item in value]
        else:
            items = str(value).split(",")

        names: set[str] = set()
        for item in items:
            name = item.strip().strip("[](){}'\" ").casefold()
            if name:
                names.add(name)
        return names

    @staticmethod
    def _parse_fraction(value: object) -> float:
        """Parse the leading number of *value* as a fraction.

        The number is divided by 100 when it is directly followed by ``%``
        (so ``"0.8%"`` is 0.008, not 0.8) or when it is greater than 1 (a bare
        ``12`` is read as 12%).

        Raises:
            IndexError: if *value* contains no tokens.
            ValueError: if the first token is not a number.
        """
        text = str(value)
        token = text.replace("%", " ").split()[0]
        number = float(token)

        # Only treat "%" as explicit when it belongs to the first token, so
        # that text like "0.12 (12%)" keeps its leading fraction.
        head, percent_sign, _ = text.partition("%")
        explicit_percent = bool(percent_sign) and len(head.split()) == 1

        if explicit_percent or number > 1:
            number /= 100.0
        return number