Roopalgn committed on
Commit
ae36543
·
1 Parent(s): 375aa81

Add grader and dataset unit tests with scoring contract

Browse files
analysis/scoring_contract.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scoring Contract
2
+
3
+ > Internal note for test design and scorer review
4
+
5
+ ## Goal
6
+
7
+ Make the helpdesk grader deterministic, defensible, and only fuzzy where we can explain why.
8
+
9
+ ## Exact-Match-Only Fields
10
+
11
+ These fields should never receive partial credit:
12
+
13
+ - `assignment_group`
14
+ - `resolution_action`
15
+
16
+ If either is wrong, the field score should be exactly `0.0`.
17
+
18
+ ## Limited Partial-Credit Fields
19
+
20
+ ### `issue_type`
21
+
22
+ `issue_type` can receive partial credit only for explicitly listed near-miss pairs in `server/grader.py`.
23
+
24
+ Implications:
25
+
26
+ - exact match = `1.0`
27
+ - listed near miss = configured partial score
28
+ - unlisted wrong label = `0.0`
29
+
30
+ There should be no hidden semantic fuzziness beyond the declared similarity map.
31
+
32
+ ### `priority`
33
+
34
+ `priority` can receive partial credit only for explicitly listed adjacency / proximity pairs in `server/grader.py`.
35
+
36
+ Implications:
37
+
38
+ - exact match = `1.0`
39
+ - defined nearby priority = configured partial score
40
+ - undefined mismatch = `0.0`
41
+
42
+ ## Task Weight Contract
43
+
44
+ - Task 1: `issue_type` only
45
+ - Task 2: `issue_type` 60%, `priority` 40%
46
+ - Task 3:
47
+   - `issue_type` 35%
48
+   - `priority` 20%
49
+   - `assignment_group` 25%
50
+   - `resolution_action` 20%
51
+
52
+ The weighted score should always stay in `[0.0, 1.0]`.
53
+
54
+ ## What The Tests Must Prove
55
+
56
+ 1. exact matches score `1.0`
57
+ 2. unsupported task IDs fail clearly
58
+ 3. only intended issue-type pairs get partial credit
59
+ 4. unrelated issue types get `0.0`
60
+ 5. priority proximity follows the declared table exactly
61
+ 6. assignment group and resolution action remain exact-only
62
+ 7. task weights apply exactly as documented
63
+ 8. dataset loading stays robust, including UTF-8 BOM handling
64
+
65
+ ## Review Rule
66
+
67
+ Before adding any new similarity pair:
68
+
69
+ 1. justify it with a real-world ticket ambiguity
70
+ 2. make sure it does not blur clearly distinct operational actions
71
+ 3. add or update a test that proves the intended behavior
tests/openenv_test_stubs.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ import types
5
+
6
+ from pydantic import BaseModel
7
+
8
+
9
def install_openenv_type_stubs() -> None:
    """Register minimal stand-ins for the ``openenv`` package in ``sys.modules``.

    Four empty modules are created (``openenv`` down to
    ``openenv.core.env_server.types``), linked parent-to-child as attributes,
    and the innermost one is given bare pydantic models named ``Action``,
    ``Observation`` and ``State``. Any entries already present in
    ``sys.modules`` under those names are overwritten.
    """
    dotted_names = (
        "openenv",
        "openenv.core",
        "openenv.core.env_server",
        "openenv.core.env_server.types",
    )
    stub_modules = {dotted: types.ModuleType(dotted) for dotted in dotted_names}

    # Bodies stay as ``pass`` on purpose: the stubs carry no fields and no
    # docstring of their own.
    class Action(BaseModel):
        pass

    class Observation(BaseModel):
        pass

    class State(BaseModel):
        pass

    innermost = stub_modules[dotted_names[-1]]
    for model in (Action, Observation, State):
        setattr(innermost, model.__name__, model)

    # Expose every child module as an attribute of its parent so that
    # attribute-style access (``openenv.core.env_server.types``) resolves.
    for dotted in dotted_names[1:]:
        parent_name, _, child_attr = dotted.rpartition(".")
        setattr(stub_modules[parent_name], child_attr, stub_modules[dotted])

    # Registered outermost-first, matching the order of ``dotted_names``.
    sys.modules.update(stub_modules)
36
+
37
+
38
+ install_openenv_type_stubs()
tests/test_grader_unit.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import unittest
4
+
5
+ import openenv_test_stubs # noqa: F401
6
+
7
+ from models import HelpdeskTicketAction, HelpdeskTicketRecord
8
+ from server.grader import grade_action
9
+
10
+
11
def _ticket(
    *,
    issue_type: str = "billing_license",
    priority: str = "high",
    assignment_group: str = "license_ops",
    resolution_action: str = "fulfill",
) -> HelpdeskTicketRecord:
    """Build the synthetic reference ticket used by the grader tests.

    Only the four graded fields can be overridden (keyword-only); the
    identifying metadata is fixed so that every test grades against the same
    ticket apart from the fields it deliberately changes.
    """
    fixed_metadata = {
        "ticket_id": "ticket-test",
        "title": "Test ticket",
        "requester": "user@example.com",
        "description": "Synthetic ticket used for deterministic grader tests.",
    }
    graded_fields = {
        "issue_type": issue_type,
        "priority": priority,
        "assignment_group": assignment_group,
        "resolution_action": resolution_action,
    }
    return HelpdeskTicketRecord(**fixed_metadata, **graded_fields)
28
+
29
+
30
class GraderUnitTests(unittest.TestCase):
    """Pin the scoring behaviour of ``grade_action``.

    Each case grades a hand-built action against a ticket from ``_ticket`` and
    asserts the overall score, the per-field breakdown, or both.
    """

    def test_task_3_exact_match_scores_one(self) -> None:
        # The action repeats the default values of the reference ticket.
        reference = _ticket()
        submitted = HelpdeskTicketAction(
            issue_type="billing_license",
            priority="high",
            assignment_group="license_ops",
            resolution_action="fulfill",
        )

        total, per_field = grade_action(submitted, reference, task_id=3)

        graded_fields = (
            "issue_type",
            "priority",
            "assignment_group",
            "resolution_action",
        )
        self.assertAlmostEqual(total, 1.0)
        self.assertEqual(per_field, dict.fromkeys(graded_fields, 1.0))

    def test_unknown_task_id_raises(self) -> None:
        # A task id outside the supported set must be rejected explicitly.
        reference = _ticket()
        submitted = HelpdeskTicketAction(issue_type="billing_license")

        self.assertRaisesRegex(
            ValueError,
            "Unsupported task_id",
            grade_action,
            submitted,
            reference,
            task_id=99,
        )

    def test_issue_type_partial_credit_only_for_known_similarity_pair(self) -> None:
        # billing_license vs. service_request is expected to earn 0.4.
        reference = _ticket(issue_type="billing_license")
        submitted = HelpdeskTicketAction(issue_type="service_request")

        total, per_field = grade_action(submitted, reference, task_id=1)

        self.assertAlmostEqual(total, 0.4)
        self.assertEqual(per_field, {"issue_type": 0.4})

    def test_unrelated_issue_type_gets_zero_not_fuzzy_credit(self) -> None:
        # onboarding vs. spam_phishing is expected to earn nothing.
        reference = _ticket(issue_type="onboarding")
        submitted = HelpdeskTicketAction(issue_type="spam_phishing")

        total, per_field = grade_action(submitted, reference, task_id=1)

        self.assertAlmostEqual(total, 0.0)
        self.assertEqual(per_field, {"issue_type": 0.0})

    def test_priority_scoring_uses_defined_proximity_table(self) -> None:
        # Expected total: 0.6 * 1.0 (issue_type) + 0.4 * 0.6 (priority) = 0.84.
        reference = _ticket(priority="critical")
        submitted = HelpdeskTicketAction(issue_type="billing_license", priority="high")

        total, per_field = grade_action(submitted, reference, task_id=2)

        self.assertAlmostEqual(per_field["issue_type"], 1.0)
        self.assertAlmostEqual(per_field["priority"], 0.6)
        self.assertAlmostEqual(total, 0.84)

    def test_task_2_weights_apply_as_documented(self) -> None:
        # Expected total: 0.6 * 1.0 (issue_type) + 0.4 * 0.5 (priority) = 0.8.
        reference = _ticket(priority="high")
        submitted = HelpdeskTicketAction(issue_type="billing_license", priority="medium")

        total, per_field = grade_action(submitted, reference, task_id=2)

        self.assertEqual(per_field, {"issue_type": 1.0, "priority": 0.5})
        self.assertAlmostEqual(total, 0.8)

    def test_assignment_group_is_exact_match_only(self) -> None:
        # Only assignment_group differs; the expected total drops to 0.75.
        reference = _ticket()
        submitted = HelpdeskTicketAction(
            issue_type="billing_license",
            priority="high",
            assignment_group="service_desk",
            resolution_action="fulfill",
        )

        total, per_field = grade_action(submitted, reference, task_id=3)

        self.assertEqual(per_field["assignment_group"], 0.0)
        self.assertAlmostEqual(total, 0.75)

    def test_resolution_action_is_exact_match_only(self) -> None:
        # Only resolution_action differs; the expected total drops to 0.8.
        reference = _ticket()
        submitted = HelpdeskTicketAction(
            issue_type="billing_license",
            priority="high",
            assignment_group="license_ops",
            resolution_action="assign",
        )

        total, per_field = grade_action(submitted, reference, task_id=3)

        self.assertEqual(per_field["resolution_action"], 0.0)
        self.assertAlmostEqual(total, 0.8)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ unittest.main()
tests/test_tasks_unit.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import unittest
5
+ from unittest import mock
6
+
7
+ import openenv_test_stubs # noqa: F401
8
+
9
+ from models import HelpdeskTicketRecord
10
+ from server import tasks as task_module
11
+ from server.tasks import TASKS, get_task_definition, load_dataset
12
+ from vocabulary import TASK_IDS
13
+
14
+
15
class TasksAndDatasetUnitTests(unittest.TestCase):
    """Checks on the task registry and on the records from ``load_dataset``."""

    def test_task_ids_match_frozen_contract(self) -> None:
        # Compared as a tuple, so both membership and order must agree.
        registered_ids = tuple(TASKS.keys())
        self.assertEqual(registered_ids, TASK_IDS)

    def test_task_allowed_fields_match_expected_ladder(self) -> None:
        # Each task is expected to extend the field list of the previous one.
        expected_ladder = {
            1: ["issue_type"],
            2: ["issue_type", "priority"],
            3: [
                "issue_type",
                "priority",
                "assignment_group",
                "resolution_action",
            ],
        }
        for task_id, expected_fields in expected_ladder.items():
            self.assertEqual(
                get_task_definition(task_id)["allowed_fields"], expected_fields
            )

    def test_invalid_task_id_raises(self) -> None:
        # Task id 0 must be rejected with the "Unsupported task_id" message.
        self.assertRaisesRegex(
            ValueError, "Unsupported task_id", get_task_definition, 0
        )

    def test_load_dataset_returns_valid_records(self) -> None:
        loaded = load_dataset()

        self.assertEqual(len(loaded), 45)
        every_item_is_record = all(
            isinstance(item, HelpdeskTicketRecord) for item in loaded
        )
        self.assertTrue(every_item_is_record)

    def test_dataset_ticket_ids_are_unique(self) -> None:
        # A duplicate id would make the set smaller than the list.
        all_ids = [item.ticket_id for item in load_dataset()]

        self.assertEqual(len(all_ids), len(set(all_ids)))

    def test_related_ticket_ids_reference_existing_records(self) -> None:
        loaded = load_dataset()
        known_ids = {item.ticket_id for item in loaded}

        # Collect every non-null link that points at an id not in the dataset.
        dangling_links = []
        for item in loaded:
            link = item.related_ticket_id
            if link is None or link in known_ids:
                continue
            dangling_links.append(link)

        self.assertEqual(dangling_links, [])

    def test_dataset_covers_all_defined_issue_types(self) -> None:
        expected_issue_types = {
            "application_support",
            "billing_license",
            "feature_request",
            "general_inquiry",
            "identity_access",
            "onboarding",
            "security_compliance",
            "service_request",
            "spam_phishing",
        }
        seen_issue_types = {item.issue_type for item in load_dataset()}

        self.assertEqual(seen_issue_types, expected_issue_types)

    def test_load_dataset_accepts_utf8_bom(self) -> None:
        # A one-record JSON array preceded by the three-byte UTF-8 BOM.
        bom_payload = b"".join(
            (
                b"\xef\xbb\xbf",
                b"[",
                b"{",
                b'"ticket_id":"ticket-bom",',
                b'"title":"BOM test",',
                b'"requester":"user@example.com",',
                b'"description":"Dataset loader should tolerate UTF-8 BOM.",',
                b'"issue_type":"general_inquiry",',
                b'"priority":"low",',
                b'"assignment_group":"service_desk",',
                b'"resolution_action":"acknowledge",',
                b'"ambiguity_note":null,',
                b'"related_ticket_id":null',
                b"}",
                b"]",
            )
        )

        # Stand-in for ``Path.open``: ignores the path and the mode and serves
        # the in-memory payload, decoded with whatever encoding was requested.
        def open_bom_payload(self, mode="r", encoding=None):  # type: ignore[no-untyped-def]
            raw_stream = io.BytesIO(bom_payload)
            return io.TextIOWrapper(raw_stream, encoding=encoding)

        with mock.patch.object(task_module.Path, "open", open_bom_payload):
            loaded = load_dataset()

        loaded_ids = [item.ticket_id for item in loaded]
        self.assertEqual(loaded_ids, ["ticket-bom"])
108
+
109
+
110
+ if __name__ == "__main__":
111
+ unittest.main()