from dataclasses import dataclass, fields, asdict, field
from typing import Any

from numpy import ndarray

from scorevision.chute_template.schemas import SVFrameResult, TVPredictInput
from scorevision.vlm_pipeline.domain_specific_schemas.challenge_types import (
    ChallengeType,
)


@dataclass
class Evaluation:
    """Base class for score dataclasses; float(instance) is the mean of its fields (each coerced to float)."""

    @property
    def average(self) -> float:
        # Each field is coerced to float, so nested Evaluation fields contribute their own average.
        values = [float(getattr(self, f.name)) for f in fields(self)]
        return sum(values) / len(values) if values else 0.0

    def __float__(self) -> float:
        return self.average

    def to_dict(self) -> dict:
        return asdict(self)


@dataclass
class KeypointsScore(Evaluation):
    # How correct are the keypoint detections, based on the alignment of the transformed floor lines?
    floor_markings_alignment: float = 0.0


@dataclass
class ActionScore(Evaluation):
    # How correct are the action labels for the scene compared with the pseudo-GT annotations?
    categorisation: float = 0.0


@dataclass
class ObjectsScore(Evaluation):
    # How correct are the object bounding boxes compared with the pseudo-GT annotations (IoU)?
    bbox_placement: float = 0.0
    # How correctly are the objects categorised compared with the pseudo-GT annotations (e.g. player, ball)?
    categorisation: float = 0.0
    # How correctly are the teams assigned compared with the pseudo-GT annotations?
    team: float = 0.0
    # How correct is the number of detected objects compared with the pseudo-GT annotations?
    enumeration: float = 0.0
    # How stable/smooth are the object detections across the video?
    tracking_stability: float = 0.0


@dataclass
class LatencyScore(Evaluation):
    # How quickly does the miner produce predictions for the video? Scored as 1 / 2**t.
    inference: float = 0.0


@dataclass
class TotalScore(Evaluation):
    """Aggregate score whose overall average is the mean of the four sub-score averages."""
    action: ActionScore = field(default_factory=ActionScore)
    keypoints: KeypointsScore = field(default_factory=KeypointsScore)
    objects: ObjectsScore = field(default_factory=ObjectsScore)
    latency: LatencyScore = field(default_factory=LatencyScore)


@dataclass
class SVChallenge:
    """A single challenge: environment, predict payload, prompt, sampled frames, and associated metadata."""
    env: str
    payload: TVPredictInput
    meta: dict[str, Any]
    prompt: str
    challenge_id: str
    frame_numbers: list[int]
    frames: list[ndarray]
    dense_optical_flow_frames: list[ndarray]
    api_task_id: str | int | None = None
    challenge_type: ChallengeType | None = None


@dataclass
class SVRunOutput:
    """Outcome of a single model run: success flag, latency in milliseconds, frame predictions, and any error."""
    success: bool
    latency_ms: float
    predictions: dict[str, list[SVFrameResult]] | None
    error: str | None
    model: str | None = None


@dataclass
class SVPredictResult:
    """Result of a predict call: success flag, model name, latency in seconds, predictions, error, and optional raw response."""
    success: bool
    model: str | None
    latency_seconds: float
    predictions: dict[str, Any] | None
    error: str | None
    raw: dict[str, Any] | None = None


@dataclass
class SVEvaluation:
    """Final evaluation: per-category accuracy breakdown, overall accuracy, latency in milliseconds, and combined score."""
    acc_breakdown: dict[str, float]
    acc: float
    latency_ms: float
    score: float
    details: dict[str, Any]
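

# Illustrative sketch only (not part of the evaluation pipeline): how the Evaluation
# dataclasses compose. Each sub-score's __float__ returns the mean of its own fields,
# and TotalScore.average is the mean of those sub-score averages. Only names defined
# in this module are used; the values below are arbitrary.
if __name__ == "__main__":
    total = TotalScore(
        action=ActionScore(categorisation=0.5),
        keypoints=KeypointsScore(floor_markings_alignment=0.4),
        objects=ObjectsScore(
            bbox_placement=0.8,
            categorisation=0.9,
            team=1.0,
            enumeration=0.7,
            tracking_stability=0.6,
        ),
        latency=LatencyScore(inference=0.9),
    )
    print(total.objects.average)  # (0.8 + 0.9 + 1.0 + 0.7 + 0.6) / 5 = 0.8
    print(float(total))  # (0.5 + 0.4 + 0.8 + 0.9) / 4 = 0.65
    print(total.to_dict())  # nested plain dict via dataclasses.asdict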