Spaces:

kumar6591
/

data-quality-env

Sleeping

File size: 2,611 Bytes

from tasks.base import BaseTask
from env.models import AuditReport


class Task4(BaseTask):
    def get_description(self) -> str:
        return (
            "Audit three related tables: 'customers', 'orders', 'line_items'. "
            "Find: (1) orphaned orders (customer_id in orders not in customers), "
            "(2) temporal violations (ship_date before order_date), "
            "(3) aggregate mismatches (order.order_total != SUM(line_items.price * quantity) for that order). "
            "You MUST write JOIN queries across tables. Report count per issue type with confidence."
        )

    def get_table_names(self) -> list[str]:
        return ["customers", "orders", "line_items"]

    def grade(self, report: AuditReport, gold: dict) -> tuple[float, dict]:
        scores: dict[str, float] = {}

        def find_relational_issue(issue_type_keyword: str):
            for issue in report.relational_issues:
                if issue_type_keyword.lower() in str(issue.get("issue_type", "")).lower():
                    return issue
            return None

        orphan_issue = find_relational_issue("orphan") or find_relational_issue("foreign_key")
        if orphan_issue:
            acc = self.count_accuracy(int(orphan_issue.get("count", 0)), int(gold["orphaned_order_count"]))
            conf = float(orphan_issue.get("confidence", 0.5))
            scores["orphans"] = self.brier_adjust(acc, conf, acc > 0.5)
        else:
            scores["orphans"] = 0.0

        temp_issue = find_relational_issue("temporal") or find_relational_issue("ship_date")
        if temp_issue:
            acc = self.count_accuracy(int(temp_issue.get("count", 0)), int(gold["temporal_violation_count"]))
            conf = float(temp_issue.get("confidence", 0.5))
            scores["temporal"] = self.brier_adjust(acc, conf, acc > 0.5)
        else:
            scores["temporal"] = 0.0

        agg_issue = find_relational_issue("aggregate") or find_relational_issue("mismatch") or find_relational_issue("total")
        if agg_issue:
            acc = self.count_accuracy(int(agg_issue.get("count", 0)), int(gold["aggregate_mismatch_count"]))
            conf = float(agg_issue.get("confidence", 0.5))
            scores["aggregates"] = self.brier_adjust(acc, conf, acc > 0.5)
        else:
            scores["aggregates"] = 0.0

        scores = {k: self.strict_score(v) for k, v in scores.items()}

        weights = {"orphans": 0.40, "temporal": 0.35, "aggregates": 0.25}
        total = sum(scores[k] * weights[k] for k in weights)
        return self.strict_score(round(total, 4)), scores