Spaces:

Roopalgn
/

AIHack-ITHelpDesk

Running

App Files Files Community

Roopalgn committed on Apr 3

Commit

ae36543

1 Parent(s): 375aa81

Add grader and dataset unit tests with scoring contract

Browse files

Files changed (4) hide show

analysis/scoring_contract.md +71 -0
tests/openenv_test_stubs.py +38 -0
tests/test_grader_unit.py +127 -0
tests/test_tasks_unit.py +111 -0

analysis/scoring_contract.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# Scoring Contract
+> Internal note for test design and scorer review
+## Goal
+Make the helpdesk grader deterministic, defensible, and only fuzzy where we can explain why.
+## Exact-Match-Only Fields
+These fields should never receive partial credit:
+- `assignment_group`
+- `resolution_action`
+If either is wrong, the field score should be exactly `0.0`.
+## Limited Partial-Credit Fields
+### `issue_type`
+`issue_type` can receive partial credit only for explicitly listed near-miss pairs in `server/grader.py`.
+Implications:
+- exact match = `1.0`
+- listed near miss = configured partial score
+- unlisted wrong label = `0.0`
+There should be no hidden semantic fuzziness beyond the declared similarity map.
+### `priority`
+`priority` can receive partial credit only for explicitly listed adjacency / proximity pairs in `server/grader.py`.
+Implications:
+- exact match = `1.0`
+- defined nearby priority = configured partial score
+- undefined mismatch = `0.0`
+## Task Weight Contract
+- Task 1: `issue_type` only
+- Task 2: `issue_type` 60%, `priority` 40%
+- Task 3:
+  - `issue_type` 35%
+  - `priority` 20%
+  - `assignment_group` 25%
+  - `resolution_action` 20%
+The weighted score should always stay in `[0.0, 1.0]`.
+## What The Tests Must Prove
+1. exact matches score `1.0`
+2. unsupported task IDs fail clearly
+3. only intended issue-type pairs get partial credit
+4. unrelated issue types get `0.0`
+5. priority proximity follows the declared table exactly
+6. assignment group and resolution action remain exact-only
+7. task weights apply exactly as documented
+8. dataset loading stays robust, including UTF-8 BOM handling
+## Review Rule
+Before adding any new similarity pair:
+1. justify it with a real-world ticket ambiguity
+2. make sure it does not blur clearly distinct operational actions
+3. add or update a test that proves the intended behavior

tests/openenv_test_stubs.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from __future__ import annotations
+import sys
+import types
+from pydantic import BaseModel
+def install_openenv_type_stubs() -> None:  # Register in-memory stand-ins for the `openenv` package tree in sys.modules.
+    openenv_module = types.ModuleType("openenv")  # one empty module object per dotted package level
+    core_module = types.ModuleType("openenv.core")
+    env_server_module = types.ModuleType("openenv.core.env_server")
+    types_module = types.ModuleType("openenv.core.env_server.types")
+    class Action(BaseModel):  # placeholder type: a pydantic model with no fields of its own
+        pass
+    class Observation(BaseModel):  # placeholder type, same shape as Action
+        pass
+    class State(BaseModel):  # placeholder type, same shape as Action
+        pass
+    types_module.Action = Action  # expose the three placeholders on the leaf module
+    types_module.Observation = Observation
+    types_module.State = State
+    env_server_module.types = types_module  # link each child module as an attribute of its parent
+    core_module.env_server = env_server_module
+    openenv_module.core = core_module
+    sys.modules["openenv"] = openenv_module  # unconditional assignment: replaces any entry already registered under these names
+    sys.modules["openenv.core"] = core_module
+    sys.modules["openenv.core.env_server"] = env_server_module
+    sys.modules["openenv.core.env_server.types"] = types_module
+install_openenv_type_stubs()  # executed at import time, so merely importing this module installs the stubs

tests/test_grader_unit.py ADDED Viewed

	@@ -0,0 +1,127 @@

+from __future__ import annotations
+import unittest
+import openenv_test_stubs  # noqa: F401
+from models import HelpdeskTicketAction, HelpdeskTicketRecord
+from server.grader import grade_action
+def _ticket(  # Build a HelpdeskTicketRecord fixture; only the four label fields can be overridden.
+    *,  # every argument is keyword-only
+    issue_type: str = "billing_license",
+    priority: str = "high",
+    assignment_group: str = "license_ops",
+    resolution_action: str = "fulfill",
+) -> HelpdeskTicketRecord:
+    return HelpdeskTicketRecord(
+        ticket_id="ticket-test",  # the id, title, requester and description are fixed for every fixture
+        title="Test ticket",
+        requester="user@example.com",
+        description="Synthetic ticket used for deterministic grader tests.",
+        issue_type=issue_type,
+        priority=priority,
+        assignment_group=assignment_group,
+        resolution_action=resolution_action,
+    )
+class GraderUnitTests(unittest.TestCase):  # Pins the scores grade_action is expected to return for tasks 1-3.
+    def test_task_3_exact_match_scores_one(self) -> None:  # all four fields equal the fixture defaults
+        ticket = _ticket()
+        action = HelpdeskTicketAction(
+            issue_type="billing_license",
+            priority="high",
+            assignment_group="license_ops",
+            resolution_action="fulfill",
+        )
+        score, breakdown = grade_action(action, ticket, task_id=3)
+        self.assertAlmostEqual(score, 1.0)
+        self.assertEqual(  # the breakdown must contain exactly these four keys, each at 1.0
+            breakdown,
+            {
+                "issue_type": 1.0,
+                "priority": 1.0,
+                "assignment_group": 1.0,
+                "resolution_action": 1.0,
+            },
+        )
+    def test_unknown_task_id_raises(self) -> None:  # task_id=99 must be rejected, not scored
+        ticket = _ticket()
+        action = HelpdeskTicketAction(issue_type="billing_license")
+        with self.assertRaisesRegex(ValueError, "Unsupported task_id"):  # the message is matched as a regex search
+            grade_action(action, ticket, task_id=99)
+    def test_issue_type_partial_credit_only_for_known_similarity_pair(self) -> None:  # near-miss pair: billing_license vs service_request
+        ticket = _ticket(issue_type="billing_license")
+        action = HelpdeskTicketAction(issue_type="service_request")
+        score, breakdown = grade_action(action, ticket, task_id=1)
+        self.assertAlmostEqual(score, 0.4)  # 0.4 is the partial score this test expects; the pair table itself is not visible in this file
+        self.assertEqual(breakdown, {"issue_type": 0.4})  # task 1 reports issue_type only
+    def test_unrelated_issue_type_gets_zero_not_fuzzy_credit(self) -> None:  # onboarding vs spam_phishing is expected to earn nothing
+        ticket = _ticket(issue_type="onboarding")
+        action = HelpdeskTicketAction(issue_type="spam_phishing")
+        score, breakdown = grade_action(action, ticket, task_id=1)
+        self.assertAlmostEqual(score, 0.0)
+        self.assertEqual(breakdown, {"issue_type": 0.0})
+    def test_priority_scoring_uses_defined_proximity_table(self) -> None:  # ticket is critical, answer is high
+        ticket = _ticket(priority="critical")
+        action = HelpdeskTicketAction(issue_type="billing_license", priority="high")
+        score, breakdown = grade_action(action, ticket, task_id=2)
+        self.assertAlmostEqual(breakdown["issue_type"], 1.0)
+        self.assertAlmostEqual(breakdown["priority"], 0.6)
+        self.assertAlmostEqual(score, 0.84)  # 0.6 * 1.0 + 0.4 * 0.6 under the 60/40 task-2 weighting
+    def test_task_2_weights_apply_as_documented(self) -> None:  # ticket is high, answer is medium
+        ticket = _ticket(priority="high")
+        action = HelpdeskTicketAction(issue_type="billing_license", priority="medium")
+        score, breakdown = grade_action(action, ticket, task_id=2)
+        self.assertEqual(breakdown, {"issue_type": 1.0, "priority": 0.5})
+        self.assertAlmostEqual(score, 0.8)  # 0.6 * 1.0 + 0.4 * 0.5
+    def test_assignment_group_is_exact_match_only(self) -> None:  # only assignment_group differs from the ticket
+        ticket = _ticket()
+        action = HelpdeskTicketAction(
+            issue_type="billing_license",
+            priority="high",
+            assignment_group="service_desk",
+            resolution_action="fulfill",
+        )
+        score, breakdown = grade_action(action, ticket, task_id=3)
+        self.assertEqual(breakdown["assignment_group"], 0.0)
+        self.assertAlmostEqual(score, 0.75)  # full marks minus the 25% assignment_group share of task 3
+    def test_resolution_action_is_exact_match_only(self) -> None:  # only resolution_action differs from the ticket
+        ticket = _ticket()
+        action = HelpdeskTicketAction(
+            issue_type="billing_license",
+            priority="high",
+            assignment_group="license_ops",
+            resolution_action="assign",
+        )
+        score, breakdown = grade_action(action, ticket, task_id=3)
+        self.assertEqual(breakdown["resolution_action"], 0.0)
+        self.assertAlmostEqual(score, 0.8)  # full marks minus the 20% resolution_action share of task 3
+if __name__ == "__main__":  # allow running this test file directly as a script
+    unittest.main()

tests/test_tasks_unit.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from __future__ import annotations
+import io
+import unittest
+from unittest import mock
+import openenv_test_stubs  # noqa: F401
+from models import HelpdeskTicketRecord
+from server import tasks as task_module
+from server.tasks import TASKS, get_task_definition, load_dataset
+from vocabulary import TASK_IDS
+class TasksAndDatasetUnitTests(unittest.TestCase):  # Checks the task definitions and the records returned by load_dataset.
+    def test_task_ids_match_frozen_contract(self) -> None:
+        self.assertEqual(tuple(TASKS.keys()), TASK_IDS)  # tuple comparison, so key order matters as well as membership
+    def test_task_allowed_fields_match_expected_ladder(self) -> None:  # each task adds fields on top of the previous one
+        self.assertEqual(get_task_definition(1)["allowed_fields"], ["issue_type"])
+        self.assertEqual(
+            get_task_definition(2)["allowed_fields"], ["issue_type", "priority"]
+        )
+        self.assertEqual(
+            get_task_definition(3)["allowed_fields"],
+            [
+                "issue_type",
+                "priority",
+                "assignment_group",
+                "resolution_action",
+            ],
+        )
+    def test_invalid_task_id_raises(self) -> None:  # task id 0 must be rejected
+        with self.assertRaisesRegex(ValueError, "Unsupported task_id"):
+            get_task_definition(0)
+    def test_load_dataset_returns_valid_records(self) -> None:
+        dataset = load_dataset()
+        self.assertEqual(len(dataset), 45)  # hard-coded record count; must be updated whenever the dataset changes size
+        self.assertTrue(all(isinstance(record, HelpdeskTicketRecord) for record in dataset))
+    def test_dataset_ticket_ids_are_unique(self) -> None:
+        dataset = load_dataset()
+        ticket_ids = [record.ticket_id for record in dataset]
+        self.assertEqual(len(ticket_ids), len(set(ticket_ids)))  # a duplicate id would shrink the set
+    def test_related_ticket_ids_reference_existing_records(self) -> None:
+        dataset = load_dataset()
+        ticket_ids = {record.ticket_id for record in dataset}
+        missing_links = [  # related ids that are set but do not match any ticket in the dataset
+            record.related_ticket_id
+            for record in dataset
+            if record.related_ticket_id is not None
+            and record.related_ticket_id not in ticket_ids
+        ]
+        self.assertEqual(missing_links, [])
+    def test_dataset_covers_all_defined_issue_types(self) -> None:
+        dataset = load_dataset()
+        issue_types = {record.issue_type for record in dataset}
+        self.assertEqual(  # set equality: the dataset must use exactly these nine labels, no more and no fewer
+            issue_types,
+            {
+                "application_support",
+                "billing_license",
+                "feature_request",
+                "general_inquiry",
+                "identity_access",
+                "onboarding",
+                "security_compliance",
+                "service_request",
+                "spam_phishing",
+            },
+        )
+    def test_load_dataset_accepts_utf8_bom(self) -> None:  # feeds load_dataset a one-record JSON payload prefixed with a BOM
+        sample = (
+            b"\xef\xbb\xbf"  # the UTF-8 byte order mark
+            b"["
+            b"{"
+            b'"ticket_id":"ticket-bom",'
+            b'"title":"BOM test",'
+            b'"requester":"user@example.com",'
+            b'"description":"Dataset loader should tolerate UTF-8 BOM.",'
+            b'"issue_type":"general_inquiry",'
+            b'"priority":"low",'
+            b'"assignment_group":"service_desk",'
+            b'"resolution_action":"acknowledge",'
+            b'"ambiguity_note":null,'
+            b'"related_ticket_id":null'
+            b"}"
+            b"]"
+        )
+        def fake_open(self, mode="r", encoding=None):  # type: ignore[no-untyped-def]
+            return io.TextIOWrapper(io.BytesIO(sample), encoding=encoding)  # mode is ignored; bytes are decoded with whatever encoding the caller passes
+        with mock.patch.object(task_module.Path, "open", fake_open):  # NOTE(review): presumably load_dataset calls Path.open with a BOM-aware encoding - confirm in server/tasks.py
+            dataset = load_dataset()
+        self.assertEqual([record.ticket_id for record in dataset], ["ticket-bom"])  # only the patched payload should have been read
+if __name__ == "__main__":  # allow running this test file directly as a script
+    unittest.main()