File size: 2,010 Bytes
91e7690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab076fb
 
91e7690
 
3e987ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from tasks.base import BaseTask
from env.models import AuditReport


class Task1(BaseTask):
    """Audit task covering nulls and duplicate rows in the 'customers' table."""

    def get_description(self) -> str:
        """Return the instructions given to whoever performs the audit."""
        fragments = (
            "Audit the 'customers' table. Find: (1) real NULL values in each column, ",
            "(2) disguised nulls stored as strings like 'NULL','N/A','-' etc., ",
            "(3) exact duplicate rows, and (4) near-duplicate rows (same record, 1-2 fields changed). ",
            "Report counts per finding with your confidence (0.0-1.0) in each.",
        )
        return "".join(fragments)

    def get_table_names(self) -> list[str]:
        """Return the names of the tables this task audits."""
        table_names = ["customers"]
        return table_names

    def grade(self, report: AuditReport, gold: dict) -> tuple[float, dict]:
        """Score an audit report against the gold answers.

        Four checks are scored, each passed through ``self.strict_score``
        and then combined with fixed weights (which sum to 1.0):

        * ``null_email`` / ``null_cid`` -- the count reported under
          ``report.null_issues`` for the ``email`` / ``customer_id`` column,
          compared with ``gold["null_email_total"]`` /
          ``gold["null_customer_id"]``. A missing column scores 0.0.
        * ``exact_dups`` -- ``report.duplicate_row_count`` compared with
          ``gold["exact_duplicate_rows"]``.
        * ``near_dups`` -- a flat 0.5 when any entry of
          ``report.schema_violations`` has "near" in its ``issue_type``
          (case-insensitive); no count is compared.

        ``count_accuracy``, ``brier_adjust`` and ``strict_score`` are not
        defined in this class; presumably they come from ``BaseTask`` --
        see that class for their exact semantics.

        Args:
            report: The submitted audit report.
            gold: Mapping holding the expected counts listed above.

        Returns:
            A ``(total, per_check)`` pair: the weighted total rounded to four
            decimals and passed through ``strict_score``, and the dict of
            per-check scores keyed by check name.
        """
        # (score key, column looked up in report.null_issues, key in gold)
        null_checks = (
            ("null_email", "email", "null_email_total"),
            ("null_cid", "customer_id", "null_customer_id"),
        )

        raw: dict[str, float] = {}
        for score_key, column, gold_key in null_checks:
            if column not in report.null_issues:
                # Column was not reported at all: no credit.
                raw[score_key] = 0.0
                continue
            finding = report.null_issues[column]
            accuracy = self.count_accuracy(int(finding.value), int(gold[gold_key]))
            # The third argument flags whether the count was close enough (> 0.6).
            raw[score_key] = self.brier_adjust(accuracy, finding.confidence, accuracy > 0.6)

        dup_finding = report.duplicate_row_count
        dup_accuracy = self.count_accuracy(
            int(dup_finding.value), int(gold["exact_duplicate_rows"])
        )
        raw["exact_dups"] = self.brier_adjust(
            dup_accuracy, dup_finding.confidence, dup_accuracy > 0.6
        )

        # Near-duplicates are only checked for a mention, not for a count.
        near_flagged = False
        for violation in report.schema_violations:
            if "near" in str(violation.get("issue_type", "")).lower():
                near_flagged = True
                break
        raw["near_dups"] = 0.5 if near_flagged else 0.0

        per_check = {name: self.strict_score(value) for name, value in raw.items()}

        weights = {"null_email": 0.30, "null_cid": 0.25, "exact_dups": 0.30, "near_dups": 0.15}
        weighted_total = sum(per_check[name] * weight for name, weight in weights.items())
        return self.strict_score(round(weighted_total, 4)), per_check