mitudrudutta committed on
Commit
ea03c8c
·
0 Parent(s):

feat: add core chargeback environment

Browse files
__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ChargebackOps OpenEnv package."""
2
+
3
+ from .client import ChargebackOpsEnv
4
+ from .models import (
5
+ BaselineRunResult,
6
+ ChargebackOpsAction,
7
+ ChargebackOpsObservation,
8
+ ChargebackOpsState,
9
+ GraderReport,
10
+ )
11
+
12
+ __all__ = [
13
+ "BaselineRunResult",
14
+ "ChargebackOpsAction",
15
+ "ChargebackOpsEnv",
16
+ "ChargebackOpsObservation",
17
+ "ChargebackOpsState",
18
+ "GraderReport",
19
+ ]
client.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """WebSocket client for ChargebackOps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from openenv.core import EnvClient
8
+ from openenv.core.client_types import StepResult
9
+
10
+ from .models import (
11
+ ActionTraceItem,
12
+ BaselineRunResult,
13
+ CaseQueueItem,
14
+ CaseResolutionState,
15
+ ChargebackOpsAction,
16
+ ChargebackOpsObservation,
17
+ ChargebackOpsState,
18
+ EvidenceCard,
19
+ GraderReport,
20
+ PolicyView,
21
+ VisibleCase,
22
+ )
23
+
24
+
25
def _parse_evidence(payload: dict[str, Any]) -> EvidenceCard:
    """Build an ``EvidenceCard`` from its decoded dictionary form.

    The payload keys are forwarded unchanged as keyword arguments, so model
    validation decides which keys are accepted.
    """
    card = EvidenceCard(**payload)
    return card
27
+
28
+
29
def _parse_policy(payload: dict[str, Any] | None) -> PolicyView | None:
    """Turn an optional policy payload into a ``PolicyView``.

    ``None`` is passed through untouched; any other payload is expanded into
    keyword arguments for the model constructor.
    """
    return None if payload is None else PolicyView(**payload)
33
+
34
+
35
def _parse_visible_case(payload: dict[str, Any] | None) -> VisibleCase | None:
    """Rebuild a ``VisibleCase`` (with typed nested models) from a payload.

    Returns ``None`` when no case payload was supplied. The incoming mapping is
    copied first so the caller's dictionary is never mutated.
    """
    if payload is None:
        return None

    fields = {**payload}
    # Both evidence lists hold raw dicts on the wire; missing keys become [].
    for list_key in ("retrieved_evidence", "attached_evidence"):
        fields[list_key] = [
            _parse_evidence(raw_card) for raw_card in fields.get(list_key, [])
        ]
    fields["policy"] = _parse_policy(fields.get("policy"))
    return VisibleCase(**fields)
47
+
48
+
49
def _parse_grader(payload: dict[str, Any] | None) -> GraderReport | None:
    """Turn an optional grader payload into a ``GraderReport``.

    ``None`` is passed through; otherwise the payload is handed to the model
    constructor as keyword arguments.
    """
    return None if payload is None else GraderReport(**payload)
53
+
54
+
55
class ChargebackOpsEnv(
    EnvClient[ChargebackOpsAction, ChargebackOpsObservation, ChargebackOpsState]
):
    """Typed client for the ChargebackOps environment.

    Implements the three conversion hooks between wire dictionaries and the
    typed action / observation / state models.
    """

    def _step_payload(self, action: ChargebackOpsAction) -> dict[str, Any]:
        """Serialize an action into the dictionary sent with a step request."""
        return action.model_dump()

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[ChargebackOpsObservation]:
        """Convert a step/reset response into a typed ``StepResult``.

        ``done`` and ``reward`` are read from the top level of ``payload``
        (defaulting to ``False`` and ``None``) and are mirrored onto the
        observation model.
        """
        obs_data = dict(payload.get("observation", {}))
        obs_data["queue"] = [CaseQueueItem(**item) for item in obs_data.get("queue", [])]
        obs_data["visible_case"] = _parse_visible_case(obs_data.get("visible_case"))
        obs_data["grader_report"] = _parse_grader(obs_data.get("grader_report"))

        done = payload.get("done", False)
        reward = payload.get("reward")
        # Store these in the dict instead of passing them as explicit keywords:
        # if the serialized observation already carries "done"/"reward" keys,
        # `**obs_data` plus explicit keywords would raise
        # "TypeError: got multiple values for keyword argument".
        obs_data["done"] = done
        obs_data["reward"] = reward
        observation = ChargebackOpsObservation(**obs_data)

        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: dict[str, Any]) -> ChargebackOpsState:
        """Convert a state response into a typed ``ChargebackOpsState``."""
        data = dict(payload)
        data["queue_state"] = [
            CaseResolutionState(**item) for item in data.get("queue_state", [])
        ]
        data["action_history"] = [
            ActionTraceItem(**item) for item in data.get("action_history", [])
        ]
        data["grader_report"] = _parse_grader(data.get("grader_report"))
        return ChargebackOpsState(**data)
episode_store.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Thread-safe storage for completed episode grading reports."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from threading import Lock
6
+
7
+ try:
8
+ from .models import GraderReport
9
+ except ImportError: # pragma: no cover
10
+ from models import GraderReport
11
+
12
# Serializes every read and write of the two registry variables below.
_LOCK = Lock()
# Recorded grading reports, keyed by their episode id.
_REPORTS: dict[str, GraderReport] = {}
# Episode id of the most recently recorded report; None until one is stored.
_LATEST_EPISODE_ID: str | None = None
15
+
16
+
17
def record_report(report: GraderReport) -> None:
    """Save ``report`` under its episode id and mark it as the latest one.

    A report recorded earlier under the same episode id is overwritten.
    """
    global _LATEST_EPISODE_ID

    episode_key = report.episode_id
    with _LOCK:
        _REPORTS[episode_key] = report
        _LATEST_EPISODE_ID = episode_key
24
+
25
+
26
def get_report(episode_id: str | None = None) -> GraderReport | None:
    """Look up a stored report.

    A falsy ``episode_id`` (``None`` or an empty string) selects the most
    recently recorded report. Returns ``None`` when nothing has been recorded
    yet or when the requested id is unknown.
    """
    with _LOCK:
        lookup_key = episode_id if episode_id else _LATEST_EPISODE_ID
        return None if lookup_key is None else _REPORTS.get(lookup_key)
grading.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic grading logic for ChargebackOps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ try:
6
+ from .models import CaseScoreBreakdown, GraderReport
7
+ from .simulation import CaseProgress, InternalCase, TaskScenario
8
+ except ImportError: # pragma: no cover
9
+ from models import CaseScoreBreakdown, GraderReport
10
+ from simulation import CaseProgress, InternalCase, TaskScenario
11
+
12
+
13
def _ratio(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator`` clamped to the range [0.0, 1.0].

    A non-positive denominator yields 1.0 (nothing was required, so the
    requirement counts as fully met).
    """
    if denominator > 0:
        fraction = numerator / denominator
        if fraction < 0.0:
            return 0.0
        if fraction > 1.0:
            return 1.0
        return fraction
    return 1.0
17
+
18
+
19
def score_case(
    case: InternalCase,
    progress: CaseProgress,
    step_count: int,
) -> CaseScoreBreakdown:
    """Compute the deterministic score breakdown for a single case.

    Args:
        case: Ground-truth case definition (evidence id groups, strategies,
            deadline, weight, resolution summary).
        progress: What the agent did on this case (attached evidence, final
            resolution, wasted-action counters, submit attempts).
        step_count: Step number compared against ``case.deadline_step``.

    Returns:
        A ``CaseScoreBreakdown`` whose component scores are rounded to four
        decimals and whose ``weighted_score`` is already multiplied by
        ``case.weight``.
    """
    # NOTE(review): grade_episode passes one shared step_count for every case,
    # so a case resolved before its deadline is still marked late when that
    # value exceeds case.deadline_step -- confirm this is intended.
    final_resolution = progress.final_resolution or "unresolved"

    attached = set(progress.attached_evidence_ids)
    required_attached = len(attached.intersection(case.required_evidence_ids))
    helpful_attached = len(attached.intersection(case.helpful_evidence_ids))
    harmful_attached = len(attached.intersection(case.harmful_evidence_ids))

    # Strategy correctness and outcome quality share the same three tiers.
    if final_resolution == case.optimal_strategy:
        strategy_correctness, outcome_quality = 1.0, 1.0
    elif final_resolution in case.acceptable_strategies:
        strategy_correctness, outcome_quality = 0.55, 0.6
    else:
        strategy_correctness, outcome_quality = 0.0, 0.0

    if final_resolution == "contest":
        required_total = len(case.required_evidence_ids)
        base_evidence_quality = 0.7 * _ratio(required_attached, required_total)
        bonus = 0.3 * _ratio(helpful_attached, max(1, len(case.helpful_evidence_ids)))
        penalty = 0.25 * harmful_attached
        evidence_quality = max(0.0, min(1.0, base_evidence_quality + bonus - penalty))
        # A packet is valid only when complete and free of harmful evidence.
        packet_is_clean = required_attached == required_total and harmful_attached == 0
        packet_validity = 1.0 if packet_is_clean else 0.0
    elif final_resolution in {"accept_chargeback", "issue_refund"}:
        # Non-contest resolutions need no packet; attaching helpful or harmful
        # evidence anyway lowers the evidence score.
        nothing_attached = helpful_attached == 0 and harmful_attached == 0
        evidence_quality = 1.0 if nothing_attached else 0.7
        packet_validity = 1.0
    else:
        evidence_quality = 0.0
        packet_validity = 0.0

    unresolved = final_resolution == "unresolved"
    past_deadline = step_count > case.deadline_step
    deadline_compliance = 0.0 if (unresolved or past_deadline) else 1.0

    wasted_actions = progress.duplicate_queries + progress.invalid_actions
    efficiency = max(0.0, 1.0 - min(0.9, wasted_actions * 0.1 + progress.submit_attempts * 0.05))

    weighted_score = (
        0.25 * strategy_correctness
        + 0.25 * evidence_quality
        + 0.15 * packet_validity
        + 0.15 * deadline_compliance
        + 0.10 * efficiency
        + 0.10 * outcome_quality
    )

    note_parts = [case.resolution_summary]
    if harmful_attached:
        note_parts.append("Harmful evidence weakened the case.")
    if unresolved:
        note_parts.append("Case was never resolved.")
    elif past_deadline:
        note_parts.append("Resolution happened after the deadline.")

    return CaseScoreBreakdown(
        case_id=case.case_id,
        strategy_correctness=round(strategy_correctness, 4),
        evidence_quality=round(evidence_quality, 4),
        packet_validity=round(packet_validity, 4),
        deadline_compliance=round(deadline_compliance, 4),
        efficiency=round(efficiency, 4),
        outcome_quality=round(outcome_quality, 4),
        weighted_score=round(weighted_score * case.weight, 4),
        final_resolution=final_resolution,
        notes=" ".join(note_parts),
    )
107
+
108
+
109
def grade_episode(
    task: TaskScenario,
    progress_by_case: dict[str, CaseProgress],
    step_count: int,
    episode_id: str,
    completed: bool,
) -> GraderReport:
    """Score every case of ``task`` and assemble the episode report.

    Args:
        task: Scenario whose ``cases`` are graded in order.
        progress_by_case: Per-case progress keyed by case id; every case of
            the task must have an entry.
        step_count: Step number forwarded to each per-case scoring call.
        episode_id: Identifier stored on the report.
        completed: Completion flag stored on the report as given.

    Returns:
        A ``GraderReport`` with the summed weighted score, the score
        normalized by total case weight (capped at 1.0), and a text summary.
    """
    case_reports = []
    for case in task.cases:
        case_reports.append(
            score_case(case, progress_by_case[case.case_id], step_count)
        )

    total_weight = sum(case.weight for case in task.cases)
    total_score = sum(report.weighted_score for report in case_reports)
    if total_weight == 0:
        normalized = 0.0
    else:
        normalized = min(1.0, total_score / total_weight)

    resolved_count = sum(
        1 for report in case_reports if report.final_resolution != "unresolved"
    )
    summary = (
        f"Resolved {resolved_count}/"
        f"{len(case_reports)} cases with normalized score {normalized:.3f}."
    )

    return GraderReport(
        episode_id=episode_id,
        task_id=task.task_id,
        total_score=round(total_score, 4),
        normalized_score=round(normalized, 4),
        completed=completed,
        case_reports=case_reports,
        summary=summary,
    )
models.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typed models for the ChargebackOps OpenEnv environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Literal
6
+
7
+ from openenv.core.env_server.types import Action, Observation, State
8
+ from pydantic import BaseModel, Field
9
+
10
# Names of the systems that a case can be queried against.
SystemName = Literal["orders", "payment", "shipping", "support", "refunds", "risk"]

# Ways a case can be handled.
StrategyName = Literal["contest", "accept_chargeback", "issue_refund"]

# Every action_type value accepted by the action schema.
ActionType = Literal[
    "select_case",
    "inspect_case",
    "query_system",
    "retrieve_policy",
    "add_evidence",
    "remove_evidence",
    "set_strategy",
    "submit_representment",
    "resolve_case",
]
23
+
24
+
25
class CaseQueueItem(BaseModel):
    """One row of the case queue: headline facts about a chargeback case."""

    # Identity and disputed amount.
    case_id: str
    amount: float
    currency: str
    reason_code: str
    # Status label and short description.
    status: str
    summary: str
    # Deadline information, expressed in environment steps.
    deadline_step: int
    steps_until_deadline: int
36
+
37
+
38
class EvidenceCard(BaseModel):
    """A piece of evidence as shown to the agent."""

    evidence_id: str
    # System the evidence belongs to; restricted to the SystemName values.
    source_system: SystemName
    title: str
    summary: str
    # Attachment flag; defaults to False.
    attached: bool = False
46
+
47
+
48
class PolicyView(BaseModel):
    """Policy guidance for a reason code, as shown to the agent."""

    reason_code: str
    guidance: str
    # Defaults to a fresh empty list per instance.
    required_evidence: list[str] = Field(default_factory=list)
    recommended_strategy: StrategyName
55
+
56
+
57
class VisibleCase(BaseModel):
    """Workspace view of the currently selected case."""

    # Identity and disputed amount.
    case_id: str
    order_id: str
    customer_id: str
    amount: float
    currency: str
    reason_code: str
    status: str
    # Strategy chosen so far, if any.
    current_strategy: StrategyName | None = None
    summary: str
    inspection_notes: str | None = None
    # Collections default to fresh empty lists per instance.
    systems_revealed: list[SystemName] = Field(default_factory=list)
    retrieved_evidence: list[EvidenceCard] = Field(default_factory=list)
    attached_evidence: list[EvidenceCard] = Field(default_factory=list)
    # Optional policy guidance and submission status.
    policy: PolicyView | None = None
    submission_status: str | None = None
75
+
76
+
77
class TaskSummary(BaseModel):
    """Descriptive metadata about one built-in task."""

    task_id: str
    title: str
    # Restricted to the three listed difficulty labels.
    difficulty: Literal["easy", "medium", "hard"]
    objective: str
    description: str
    # Size information for the task.
    max_steps: int
    case_count: int
87
+
88
+
89
class ActionTraceItem(BaseModel):
    """One row of the compact action history."""

    step_index: int
    action_type: str
    # Optional: not every history row has a case id.
    case_id: str | None = None
    outcome: str
    reward: float
97
+
98
+
99
class CaseResolutionState(BaseModel):
    """Publicly visible state of one case within the current episode."""

    case_id: str
    status: str
    # Strategy chosen so far, if any.
    current_strategy: StrategyName | None = None
    resolved: bool = False
    steps_until_deadline: int
107
+
108
+
109
class CaseScoreBreakdown(BaseModel):
    """Grading breakdown for a single case."""

    case_id: str
    # Individual score components.
    strategy_correctness: float
    evidence_quality: float
    packet_validity: float
    deadline_compliance: float
    efficiency: float
    outcome_quality: float
    # Combined score for the case.
    weighted_score: float
    # Resolution label and free-text notes.
    final_resolution: str
    notes: str
122
+
123
+
124
class GraderReport(BaseModel):
    """Deterministic grade report covering a whole episode."""

    episode_id: str
    task_id: str
    # Aggregate scores for the episode.
    total_score: float
    normalized_score: float
    completed: bool
    # Per-case breakdowns; defaults to a fresh empty list per instance.
    case_reports: list[CaseScoreBreakdown] = Field(default_factory=list)
    summary: str
134
+
135
+
136
class BaselineTaskResult(BaseModel):
    """Baseline outcome for a single task."""

    task_id: str
    title: str
    score: float
    steps_used: int
    final_status: str
144
+
145
+
146
class BaselineRunResult(BaseModel):
    """Aggregate payload describing one baseline run across tasks."""

    # Which provider, model and mode produced the run.
    provider: str
    model_name: str
    mode: str
    # Provider call bookkeeping; counters default to zero and the error
    # mapping to a fresh empty dict per instance.
    provider_calls_attempted: int = 0
    provider_calls_succeeded: int = 0
    provider_errors: dict[str, int] = Field(default_factory=dict)
    # Per-task results and their average score.
    task_results: list[BaselineTaskResult]
    average_score: float
157
+
158
+
159
class TasksResponse(BaseModel):
    """Response body of the /tasks endpoint."""

    tasks: list[TaskSummary]
    # Schema describing the action model, as a plain dictionary.
    action_schema: dict[str, Any]
164
+
165
+
166
class ChargebackOpsAction(Action):
    """Action the agent submits to the ChargebackOps environment.

    Only ``action_type`` is required; the remaining fields are optional
    arguments whose relevance depends on the action type.
    """

    action_type: ActionType
    case_id: str | None = Field(
        default=None,
        description="Target case id when applicable",
    )
    system_name: SystemName | None = Field(
        default=None,
        description="System to query when action_type is query_system",
    )
    # Defaults to a fresh empty list per instance.
    evidence_ids: list[str] = Field(
        default_factory=list,
        description="Evidence ids to attach or remove",
    )
    strategy: StrategyName | None = Field(
        default=None,
        description="Strategy to set or use when resolving a case",
    )
    note: str | None = Field(
        default=None,
        description="Optional short rationale for the action",
    )
187
+
188
+
189
class ChargebackOpsObservation(Observation):
    """Observation produced by reset() and step()."""

    # Task description.
    task_id: str
    task_title: str
    difficulty: Literal["easy", "medium", "hard"]
    objective: str
    # Current selection, queue and workspace.
    selected_case_id: str | None = None
    queue: list[CaseQueueItem] = Field(default_factory=list)
    visible_case: VisibleCase | None = None
    # Feedback about the last action and the list of available actions.
    last_action_result: str = ""
    available_actions: list[str] = Field(default_factory=list)
    # Step budget and scoring information.
    steps_remaining: int
    progress_score: float = 0.0
    grader_report: GraderReport | None = None
204
+
205
+
206
class ChargebackOpsState(State):
    """Extended environment state exposed through state()."""

    # Task description.
    task_id: str
    task_title: str
    difficulty: Literal["easy", "medium", "hard"]
    objective: str
    # Current selection plus per-case state and the action trace.
    selected_case_id: str | None = None
    queue_state: list[CaseResolutionState] = Field(default_factory=list)
    action_history: list[ActionTraceItem] = Field(default_factory=list)
    # Grading information and the completion flag.
    latest_grade: float | None = None
    grader_report: GraderReport | None = None
    completed: bool = False
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: chargeback_ops
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+ description: Merchant dispute handling and chargeback representment environment
pyproject.toml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "openenv-chargeback_ops"
7
+ version = "0.1.0"
8
+ description = "ChargebackOps: a real-world OpenEnv environment for merchant dispute handling."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "anthropic>=0.51.0",
13
+ "fastapi>=0.135.0",
14
+ "openai>=2.30.0",
15
+ "openenv-core[core]>=0.2.2",
16
+ "pydantic>=2.12.0",
17
+ "python-dotenv>=1.0.1",
18
+ "uvicorn>=0.42.0",
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ dev = [
23
+ "httpx>=0.28.0",
24
+ "pytest>=8.0.0",
25
+ "pytest-cov>=4.0.0",
26
+ ]
27
+
28
+ [project.scripts]
29
+ server = "chargeback_ops.server.app:main"
30
+ chargebackops-baseline = "chargeback_ops.baseline_runner:main"
31
+
32
+ [tool.setuptools]
33
+ include-package-data = true
34
+ packages = ["chargeback_ops", "chargeback_ops.server"]
35
+ package-dir = { "chargeback_ops" = ".", "chargeback_ops.server" = "server" }
36
+
37
+ [tool.pytest.ini_options]
38
+ testpaths = ["tests"]
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Chargeback Ops environment server components."""
8
+
9
+ from .chargeback_ops_environment import ChargebackOpsEnvironment
10
+
11
+ __all__ = ["ChargebackOpsEnvironment"]
server/chargeback_ops_environment.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core environment implementation for ChargebackOps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from uuid import uuid4
6
+
7
+ from openenv.core.env_server.interfaces import Environment
8
+
9
+ try:
10
+ from ..episode_store import record_report
11
+ from ..grading import grade_episode
12
+ from ..models import (
13
+ ActionTraceItem,
14
+ CaseQueueItem,
15
+ CaseResolutionState,
16
+ ChargebackOpsAction,
17
+ ChargebackOpsObservation,
18
+ ChargebackOpsState,
19
+ EvidenceCard,
20
+ PolicyView,
21
+ VisibleCase,
22
+ )
23
+ from ..simulation import ActionRecord, CaseProgress, InternalCase, get_task
24
+ except ImportError: # pragma: no cover
25
+ from episode_store import record_report
26
+ from grading import grade_episode
27
+ from models import (
28
+ ActionTraceItem,
29
+ CaseQueueItem,
30
+ CaseResolutionState,
31
+ ChargebackOpsAction,
32
+ ChargebackOpsObservation,
33
+ ChargebackOpsState,
34
+ EvidenceCard,
35
+ PolicyView,
36
+ VisibleCase,
37
+ )
38
+ from simulation import ActionRecord, CaseProgress, InternalCase, get_task
39
+
40
+
41
+ class ChargebackOpsEnvironment(
42
+ Environment[ChargebackOpsAction, ChargebackOpsObservation, ChargebackOpsState]
43
+ ):
44
+ """Synthetic merchant chargeback representment environment."""
45
+
46
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
47
+
48
def __init__(self):
    """Create the environment loaded with the default task.

    A fresh episode id is generated and all per-episode bookkeeping is
    initialised through ``_reset_task_state``.
    """
    super().__init__()

    default_task = get_task("goods_not_received_easy")
    self._task = default_task

    # Declare every per-episode attribute up front; _reset_task_state() below
    # assigns their working values.
    self._selected_case_id: str | None = None
    self._last_action_result = "Environment initialized."
    self._action_history: list[ActionRecord] = []
    self._progress_by_case: dict[str, CaseProgress] = {}

    self._state = ChargebackOpsState(
        episode_id=str(uuid4()),
        step_count=0,
        task_id=default_task.task_id,
        task_title=default_task.title,
        difficulty=default_task.difficulty,
        objective=default_task.objective,
    )
    self._done = False
    self._latest_report = None

    self._reset_task_state()
66
+
67
def _reset_task_state(self) -> None:
    """Reset all per-episode bookkeeping for the currently loaded task."""
    # Every case gets its own fresh progress tracker.
    fresh_progress = {}
    for case in self._task.cases:
        fresh_progress[case.case_id] = CaseProgress()
    self._progress_by_case = fresh_progress

    self._selected_case_id = None
    self._action_history = []
    self._last_action_result = f"{self._task.title} ready."
    self._done = False
    self._latest_report = None
76
+
77
def reset(
    self,
    seed: int | None = None,
    episode_id: str | None = None,
    **kwargs,
) -> ChargebackOpsObservation:
    """Start a new episode and return its first observation.

    Args:
        seed: Accepted for interface compatibility; not used.
        episode_id: Id for the new episode; a random UUID is generated when
            omitted or empty.
        **kwargs: ``task_id`` selects the task to load and defaults to
            ``"goods_not_received_easy"``.
    """
    del seed

    chosen_task = get_task(kwargs.get("task_id", "goods_not_received_easy"))
    self._task = chosen_task
    self._state = ChargebackOpsState(
        episode_id=episode_id or str(uuid4()),
        step_count=0,
        task_id=chosen_task.task_id,
        task_title=chosen_task.title,
        difficulty=chosen_task.difficulty,
        objective=chosen_task.objective,
    )
    self._reset_task_state()
    return self._build_observation(reward=0.0, done=False)
96
+
97
def step(
    self,
    action: ChargebackOpsAction,
    timeout_s: float | None = None,
    **kwargs,
) -> ChargebackOpsObservation:
    """Apply one agent action and return the resulting observation.

    The step counter is advanced, the action is executed (a ``ValueError``
    from the handler becomes a -0.12 reward and counts as an invalid action
    on the targeted case), deadline penalties are added, and termination is
    checked. When the episode ends it is graded, the report is recorded, and
    half of the normalized score is added to the reward.

    Calling ``step`` after the episode has finished changes nothing and
    returns a -0.1 reward.
    """
    del timeout_s, kwargs  # accepted for interface compatibility; unused

    if self._done:
        return self._build_observation(
            reward=-0.1,
            done=True,
            result="Episode already completed. Reset to start another task.",
        )

    self._state.step_count += 1

    try:
        reward, result = self._apply_action(action)
    except ValueError as exc:
        reward, result = -0.12, str(exc)
        offending_case = action.case_id or self._selected_case_id
        if offending_case and offending_case in self._progress_by_case:
            self._progress_by_case[offending_case].invalid_actions += 1

    reward += self._apply_deadline_penalties()
    done = self._check_done()

    if not done:
        self._state.latest_grade = self._estimated_progress_score()
        self._state.completed = False
    else:
        report = grade_episode(
            self._task,
            self._progress_by_case,
            self._state.step_count,
            self._state.episode_id or "",
            completed=self._all_cases_resolved(),
        )
        self._latest_report = report
        self._state.latest_grade = report.normalized_score
        self._state.grader_report = report
        self._state.completed = True
        record_report(report)
        reward += 0.5 * report.normalized_score

    rounded_reward = round(reward, 4)
    self._last_action_result = result
    self._action_history.append(
        ActionRecord(
            step_index=self._state.step_count,
            action_type=action.action_type,
            case_id=action.case_id or self._selected_case_id,
            outcome=result,
            reward=rounded_reward,
        )
    )
    return self._build_observation(
        reward=rounded_reward,
        done=done,
        result=result,
    )
159
+
160
def _apply_action(self, action: ChargebackOpsAction) -> tuple[float, str]:
    """Route ``action`` to its handler and return ``(reward, message)``.

    ``select_case`` is handled first because it is the only action that does
    not go through ``_require_case``. Every other action resolves its target
    case before the action type is examined.

    Raises:
        ValueError: If the action type has no handler (handlers may raise
            ``ValueError`` for their own validation as well).
    """
    kind = action.action_type
    if kind == "select_case":
        return self._select_case(action.case_id)

    case = self._require_case(action.case_id)

    # Thunks keep argument evaluation lazy: only the chosen handler runs.
    handlers = {
        "inspect_case": lambda: self._inspect_case(case),
        "query_system": lambda: self._query_system(case, action.system_name),
        "retrieve_policy": lambda: self._retrieve_policy(case),
        "add_evidence": lambda: self._add_evidence(case, action.evidence_ids),
        "remove_evidence": lambda: self._remove_evidence(case, action.evidence_ids),
        "set_strategy": lambda: self._set_strategy(case, action.strategy),
        "submit_representment": lambda: self._submit_representment(case),
        "resolve_case": lambda: self._resolve_case(case, action.strategy),
    }
    handler = handlers.get(kind)
    if handler is None:
        raise ValueError(f"Unsupported action_type '{action.action_type}'.")
    return handler()
182
+
183
def _select_case(self, case_id: str | None) -> tuple[float, str]:
    """Make ``case_id`` the selected case and return ``(reward, message)``.

    The selection is updated even when the case is already resolved; in that
    situation a small negative reward is returned instead of the positive one.

    Raises:
        ValueError: If ``case_id`` is missing or empty.
    """
    if not case_id:
        raise ValueError("select_case requires case_id.")

    case = self._lookup_case(case_id)
    self._selected_case_id = case.case_id

    still_open = self._progress_by_case[case.case_id].resolution_status == "open"
    if still_open:
        return 0.02, f"Selected case {case.case_id}."
    return -0.02, f"Case {case.case_id} is already resolved."
192
+
193
def _inspect_case(self, case: InternalCase) -> tuple[float, str]:
    """Mark ``case`` as inspected and return ``(reward, message)``.

    The first inspection is rewarded; repeating it costs a small penalty and
    changes nothing.
    """
    progress = self._progress_by_case[case.case_id]
    if not progress.inspected:
        progress.inspected = True
        return 0.04, f"Inspected case {case.case_id}."
    return -0.01, f"Case {case.case_id} was already inspected."
199
+
200
def _query_system(
    self,
    case: InternalCase,
    system_name: str | None,
) -> tuple[float, str]:
    """Reveal the evidence ``system_name`` holds for ``case``.

    A first query marks the system as revealed and adds its evidence ids to
    the retrieved set. The reward grows with the number of helpful items and
    is slightly negative when the system holds nothing. A repeated query only
    increments the duplicate counter and is penalised.

    Raises:
        ValueError: If ``system_name`` is ``None``.
    """
    if system_name is None:
        raise ValueError("query_system requires system_name.")

    progress = self._progress_by_case[case.case_id]
    if system_name in progress.revealed_systems:
        progress.duplicate_queries += 1
        return -0.03, f"System '{system_name}' was already queried for case {case.case_id}."

    progress.revealed_systems.add(system_name)
    found = case.evidence_by_system.get(system_name, ())
    progress.retrieved_evidence_ids.update(item.evidence_id for item in found)

    found_count = len(found)
    useful_count = sum(1 for item in found if item.helpful)
    headline = (
        f"Queried {system_name} for case {case.case_id}; found {found_count} evidence items"
    )
    if useful_count > 0:
        return 0.06 + 0.01 * useful_count, f"{headline}, including {useful_count} useful ones."
    return (0.01 if found_count else -0.01), f"{headline}."
224
+
225
def _retrieve_policy(self, case: InternalCase) -> tuple[float, str]:
    """Mark the policy for ``case`` as retrieved and return ``(reward, message)``.

    Only the first retrieval is rewarded; later ones cost a small penalty.
    """
    progress = self._progress_by_case[case.case_id]
    if not progress.policy_retrieved:
        progress.policy_retrieved = True
        return 0.05, f"Retrieved policy guidance for case {case.case_id}."
    return -0.01, f"Policy already retrieved for case {case.case_id}."
231
+
232
def _add_evidence(
    self,
    case: InternalCase,
    evidence_ids: list[str],
) -> tuple[float, str]:
    """Attach retrieved evidence to ``case`` and return ``(reward, message)``.

    Ids that were never retrieved (-0.04) or are already attached (-0.02) are
    skipped with a penalty. Each attached item changes the reward by +0.08
    (helpful), -0.08 (harmful) or +0.01 (neither). If nothing ends up
    attached the reward is a flat -0.05.

    A case that is no longer open is left untouched: grading reads the
    attached evidence when the episode ends, so allowing edits after a
    resolution would let the packet be repaired retroactively.

    Raises:
        ValueError: If ``evidence_ids`` is empty.
    """
    if not evidence_ids:
        raise ValueError("add_evidence requires at least one evidence id.")

    progress = self._progress_by_case[case.case_id]
    # Same guard style as _submit_representment / _resolve_case.
    if progress.resolution_status != "open":
        return -0.04, f"Case {case.case_id} is already resolved."

    all_evidence = self._evidence_map(case)
    reward = 0.0
    added: list[str] = []
    for evidence_id in evidence_ids:
        if evidence_id not in progress.retrieved_evidence_ids:
            reward -= 0.04
            continue
        if evidence_id in progress.attached_evidence_ids:
            reward -= 0.02
            continue
        progress.attached_evidence_ids.append(evidence_id)
        added.append(evidence_id)
        evidence = all_evidence[evidence_id]
        if evidence.helpful:
            reward += 0.08
        elif evidence.harmful:
            reward -= 0.08
        else:
            reward += 0.01

    if not added:
        return -0.05, f"No evidence was attached to case {case.case_id}."
    return reward, f"Attached evidence {', '.join(added)} to case {case.case_id}."
264
+
265
def _remove_evidence(
    self,
    case: InternalCase,
    evidence_ids: list[str],
) -> tuple[float, str]:
    """Detach evidence from ``case`` and return ``(reward, message)``.

    Ids that are not attached cost -0.02 each. Each removed item changes the
    reward by +0.05 (harmful), -0.03 (helpful) or +0.01 (neither). If nothing
    is removed the reward is a flat -0.04.

    A case that is no longer open is left untouched: grading reads the
    attached evidence when the episode ends, so allowing edits after a
    resolution would let the packet be repaired retroactively.

    Raises:
        ValueError: If ``evidence_ids`` is empty.
    """
    if not evidence_ids:
        raise ValueError("remove_evidence requires at least one evidence id.")

    progress = self._progress_by_case[case.case_id]
    # Same guard style as _submit_representment / _resolve_case.
    if progress.resolution_status != "open":
        return -0.04, f"Case {case.case_id} is already resolved."

    evidence_map = self._evidence_map(case)
    reward = 0.0
    removed: list[str] = []
    for evidence_id in evidence_ids:
        if evidence_id not in progress.attached_evidence_ids:
            reward -= 0.02
            continue
        progress.attached_evidence_ids.remove(evidence_id)
        removed.append(evidence_id)
        evidence = evidence_map[evidence_id]
        if evidence.harmful:
            reward += 0.05
        elif evidence.helpful:
            reward -= 0.03
        else:
            reward += 0.01

    if not removed:
        return -0.04, f"No evidence was removed from case {case.case_id}."
    return reward, f"Removed evidence {', '.join(removed)} from case {case.case_id}."
293
+
294
def _set_strategy(
    self,
    case: InternalCase,
    strategy: str | None,
) -> tuple[float, str]:
    """Record ``strategy`` as the current strategy for ``case``.

    The strategy is stored regardless of its quality; the reward reflects
    whether it is the optimal one, an acceptable fallback, or neither.

    Raises:
        ValueError: If ``strategy`` is ``None``.
    """
    if strategy is None:
        raise ValueError("set_strategy requires strategy.")

    self._progress_by_case[case.case_id].current_strategy = strategy

    if strategy == case.optimal_strategy:
        reward, label = 0.1, "the optimal strategy"
    elif strategy in case.acceptable_strategies:
        reward, label = 0.03, "an acceptable fallback strategy"
    else:
        reward, label = -0.08, "a weak strategy"
    return reward, f"Set {label} '{strategy}' for case {case.case_id}."
308
+
309
def _submit_representment(self, case: InternalCase) -> tuple[float, str]:
    """Submit the contest packet for ``case`` and return ``(reward, message)``.

    The attempt counter is incremented before any validation, so rejected
    and repeated submissions are counted too. For an open case the final
    resolution becomes ``"contest"`` and the status is decided by the first
    matching rule: past the deadline, missing required evidence, harmful
    evidence attached, contest being the optimal strategy (a win), otherwise
    an unsupported contest.

    Raises:
        ValueError: If the current strategy is not ``"contest"``.
    """
    progress = self._progress_by_case[case.case_id]
    progress.submit_attempts += 1

    if progress.current_strategy != "contest":
        raise ValueError("submit_representment requires current strategy to be 'contest'.")
    if progress.resolution_status != "open":
        return -0.05, f"Case {case.case_id} is already resolved."

    attached = set(progress.attached_evidence_ids)
    missing = set(case.required_evidence_ids) - attached
    harmful = set(case.harmful_evidence_ids) & attached

    # Every outcome below records the case as contested.
    progress.final_resolution = "contest"

    if self._state.step_count > case.deadline_step:
        outcome = (
            "lost_late",
            -0.2,
            f"Representment for case {case.case_id} was submitted after the deadline.",
        )
    elif missing:
        outcome = (
            "lost_incomplete",
            -0.18,
            f"Representment for case {case.case_id} is incomplete; missing {', '.join(sorted(missing))}.",
        )
    elif harmful:
        outcome = (
            "lost_harmful_evidence",
            -0.15,
            f"Representment for case {case.case_id} included harmful evidence {', '.join(sorted(harmful))}.",
        )
    elif case.optimal_strategy == "contest":
        outcome = (
            "won",
            0.2,
            f"Submitted a strong representment package for case {case.case_id}.",
        )
    else:
        outcome = (
            "lost_contest",
            -0.12,
            f"Contested case {case.case_id}, but the case was not supportable.",
        )

    progress.resolution_status, reward, message = outcome
    return reward, message
343
+
344
+ def _resolve_case(
345
+ self,
346
+ case: InternalCase,
347
+ strategy: str | None,
348
+ ) -> tuple[float, str]:
349
+ progress = self._progress_by_case[case.case_id]
350
+ resolution = strategy or progress.current_strategy
351
+ if resolution not in {"accept_chargeback", "issue_refund"}:
352
+ raise ValueError("resolve_case requires strategy accept_chargeback or issue_refund.")
353
+ if progress.resolution_status != "open":
354
+ return -0.04, f"Case {case.case_id} is already resolved."
355
+ progress.final_resolution = resolution
356
+ progress.current_strategy = resolution
357
+ progress.resolution_status = (
358
+ "refunded" if resolution == "issue_refund" else "accepted_chargeback"
359
+ )
360
+ if self._state.step_count > case.deadline_step:
361
+ return -0.15, f"Resolved case {case.case_id} after the response deadline."
362
+ if resolution == case.optimal_strategy:
363
+ return 0.16, f"Resolved case {case.case_id} with the optimal non-contest strategy."
364
+ if resolution in case.acceptable_strategies:
365
+ return 0.06, f"Resolved case {case.case_id} with an acceptable fallback strategy."
366
+ return -0.12, f"Resolved case {case.case_id} with the wrong strategy."
367
+
368
+ def _apply_deadline_penalties(self) -> float:
369
+ penalty = 0.0
370
+ for case in self._task.cases:
371
+ progress = self._progress_by_case[case.case_id]
372
+ if progress.resolution_status == "open" and self._state.step_count > case.deadline_step:
373
+ if not progress.deadline_penalized:
374
+ progress.deadline_penalized = True
375
+ penalty -= 0.15
376
+ return penalty
377
+
378
+ def _check_done(self) -> bool:
379
+ if self._all_cases_resolved():
380
+ self._done = True
381
+ elif self._state.step_count >= self._task.max_steps:
382
+ self._done = True
383
+ return self._done
384
+
385
+ def _all_cases_resolved(self) -> bool:
386
+ return all(
387
+ progress.resolution_status != "open"
388
+ for progress in self._progress_by_case.values()
389
+ )
390
+
391
+ def _lookup_case(self, case_id: str) -> InternalCase:
392
+ for case in self._task.cases:
393
+ if case.case_id == case_id:
394
+ return case
395
+ raise ValueError(f"Unknown case_id '{case_id}'.")
396
+
397
+ def _require_case(self, case_id: str | None) -> InternalCase:
398
+ target_case_id = case_id or self._selected_case_id
399
+ if target_case_id is None:
400
+ raise ValueError("Select a case before taking this action.")
401
+ return self._lookup_case(target_case_id)
402
+
403
+ def _evidence_map(self, case: InternalCase):
404
+ return {
405
+ item.evidence_id: item
406
+ for items in case.evidence_by_system.values()
407
+ for item in items
408
+ }
409
+
410
def _build_queue(self) -> list[CaseQueueItem]:
    """Build the queue summary shown to the agent, one item per task case.

    Items appear in the same order as the task's cases.
    ``steps_until_deadline`` is the case deadline minus the current step
    count and goes negative once the deadline has passed.
    """
    current_step = self._state.step_count
    return [
        CaseQueueItem(
            case_id=case.case_id,
            amount=case.amount,
            currency=case.currency,
            reason_code=case.reason_code,
            status=self._progress_by_case[case.case_id].resolution_status,
            summary=case.summary,
            deadline_step=case.deadline_step,
            steps_until_deadline=case.deadline_step - current_step,
        )
        for case in self._task.cases
    ]
427
+
428
+ def _build_visible_case(self) -> VisibleCase | None:
429
+ if self._selected_case_id is None:
430
+ return None
431
+ case = self._lookup_case(self._selected_case_id)
432
+ progress = self._progress_by_case[case.case_id]
433
+ evidence_map = self._evidence_map(case)
434
+ retrieved = [
435
+ EvidenceCard(
436
+ evidence_id=evidence_id,
437
+ source_system=evidence_map[evidence_id].source_system,
438
+ title=evidence_map[evidence_id].title,
439
+ summary=evidence_map[evidence_id].summary,
440
+ attached=evidence_id in progress.attached_evidence_ids,
441
+ )
442
+ for evidence_id in sorted(progress.retrieved_evidence_ids)
443
+ ]
444
+ attached = [
445
+ EvidenceCard(
446
+ evidence_id=evidence_id,
447
+ source_system=evidence_map[evidence_id].source_system,
448
+ title=evidence_map[evidence_id].title,
449
+ summary=evidence_map[evidence_id].summary,
450
+ attached=True,
451
+ )
452
+ for evidence_id in progress.attached_evidence_ids
453
+ ]
454
+ policy = None
455
+ if progress.policy_retrieved:
456
+ policy = PolicyView(
457
+ reason_code=case.reason_code,
458
+ guidance=case.policy_guidance,
459
+ required_evidence=list(case.policy_requirements),
460
+ recommended_strategy=case.recommended_strategy,
461
+ )
462
+ return VisibleCase(
463
+ case_id=case.case_id,
464
+ order_id=case.order_id,
465
+ customer_id=case.customer_id,
466
+ amount=case.amount,
467
+ currency=case.currency,
468
+ reason_code=case.reason_code,
469
+ status=progress.resolution_status,
470
+ current_strategy=progress.current_strategy,
471
+ summary=case.summary,
472
+ inspection_notes=case.inspection_notes if progress.inspected else None,
473
+ systems_revealed=sorted(progress.revealed_systems),
474
+ retrieved_evidence=retrieved,
475
+ attached_evidence=attached,
476
+ policy=policy,
477
+ submission_status=progress.resolution_status
478
+ if progress.resolution_status != "open"
479
+ else None,
480
+ )
481
+
482
+ def _build_available_actions(self) -> list[str]:
483
+ if self._done:
484
+ return []
485
+ base = ["select_case"]
486
+ if self._selected_case_id is None:
487
+ return base
488
+ case_progress = self._progress_by_case[self._selected_case_id]
489
+ if case_progress.resolution_status != "open":
490
+ return ["select_case"]
491
+ return base + [
492
+ "inspect_case",
493
+ "query_system",
494
+ "retrieve_policy",
495
+ "add_evidence",
496
+ "remove_evidence",
497
+ "set_strategy",
498
+ "submit_representment",
499
+ "resolve_case",
500
+ ]
501
+
502
def _estimated_progress_score(self) -> float:
    """Grade the episode as it stands and return the normalized score.

    The grader is invoked with ``completed=False``; an unset episode id is
    passed as an empty string.
    """
    episode_id = self._state.episode_id or ""
    interim_report = grade_episode(
        self._task,
        self._progress_by_case,
        self._state.step_count,
        episode_id,
        completed=False,
    )
    return interim_report.normalized_score
511
+
512
def _build_observation(
    self,
    reward: float,
    done: bool,
    result: str | None = None,
) -> ChargebackOpsObservation:
    """Refresh the public state snapshot and build the next observation.

    Side effects: rewrites ``queue_state``, ``action_history`` and
    ``selected_case_id`` on the state object from the environment's
    internal bookkeeping.

    Args:
        reward: Step reward to report.
        done: Whether the episode has finished.
        result: Message for the last action. When falsy, the stored
            last-action result is reported instead.

    Returns:
        The observation handed back to the agent.
    """
    # Prefer the latest grader report when one exists; otherwise fall back
    # to an interim estimate.
    if self._latest_report is not None:
        progress_score = self._latest_report.normalized_score
    else:
        progress_score = self._estimated_progress_score()
    rounded_score = round(progress_score, 4)

    queue_state = []
    for case in self._task.cases:
        tracker = self._progress_by_case[case.case_id]
        queue_state.append(
            CaseResolutionState(
                case_id=case.case_id,
                status=tracker.resolution_status,
                current_strategy=tracker.current_strategy,
                resolved=tracker.resolution_status != "open",
                steps_until_deadline=case.deadline_step - self._state.step_count,
            )
        )
    self._state.queue_state = queue_state

    self._state.action_history = [
        ActionTraceItem(
            step_index=entry.step_index,
            action_type=entry.action_type,
            case_id=entry.case_id,
            outcome=entry.outcome,
            reward=entry.reward,
        )
        for entry in self._action_history
    ]
    self._state.selected_case_id = self._selected_case_id

    return ChargebackOpsObservation(
        task_id=self._task.task_id,
        task_title=self._task.title,
        difficulty=self._task.difficulty,
        objective=self._task.objective,
        selected_case_id=self._selected_case_id,
        queue=self._build_queue(),
        visible_case=self._build_visible_case(),
        last_action_result=result or self._last_action_result,
        available_actions=self._build_available_actions(),
        # Clamp at zero so an overrun never reports negative steps.
        steps_remaining=max(0, self._task.max_steps - self._state.step_count),
        progress_score=rounded_score,
        grader_report=self._latest_report,
        done=done,
        reward=reward,
        metadata={
            "reward_components": {
                "step_reward": reward,
                "progress_score": rounded_score,
            }
        },
    )
567
+
568
@property
def state(self) -> ChargebackOpsState:
    """Return the environment's state object (the live instance, not a copy)."""
    current_state = self._state
    return current_state
server/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ anthropic>=0.51.0
2
+ openai>=2.30.0
3
+ openenv-core[core]>=0.2.2
4
+ python-dotenv>=1.0.1
simulation.py ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Internal task definitions and runtime types for ChargebackOps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Literal
7
+
8
+ SystemName = Literal["orders", "payment", "shipping", "support", "refunds", "risk"]
9
+ StrategyName = Literal["contest", "accept_chargeback", "issue_refund"]
10
+
11
+
12
@dataclass(frozen=True)
class InternalEvidence:
    """One immutable evidence record held by a synthetic merchant system.

    The three boolean flags all default to ``False``.
    """

    # Identifier used to retrieve and attach this item.
    evidence_id: str
    # Name of the merchant system the item comes from.
    source_system: SystemName
    # Short label and longer description of the item.
    title: str
    summary: str
    # Classification flags for the item.
    helpful: bool = False
    harmful: bool = False
    required: bool = False
23
+
24
+
25
@dataclass(frozen=True)
class InternalCase:
    """Synthetic chargeback case definition.

    Instances are immutable. Fields annotated as tuples of strings also
    accept a single bare string, which is normalized to a one-element tuple.
    This guards against the ``("text")`` typo (missing trailing comma), which
    would otherwise be iterated character by character by any consumer that
    calls ``list(...)`` on the field.
    """

    # Identifiers.
    case_id: str
    order_id: str
    customer_id: str
    # Disputed amount and its currency code.
    amount: float
    currency: str
    reason_code: str
    summary: str
    inspection_notes: str
    # Step index of the response deadline.
    deadline_step: int
    # Best strategy for the case, plus fallbacks that still earn credit.
    optimal_strategy: StrategyName
    acceptable_strategies: tuple[StrategyName, ...]
    # Policy text and the strategy shown alongside it.
    policy_guidance: str
    policy_requirements: tuple[str, ...]
    recommended_strategy: StrategyName
    resolution_summary: str
    weight: float
    # Evidence items grouped by the system that stores them.
    evidence_by_system: dict[SystemName, tuple[InternalEvidence, ...]]
    # Evidence ids by role.
    required_evidence_ids: tuple[str, ...] = ()
    helpful_evidence_ids: tuple[str, ...] = ()
    harmful_evidence_ids: tuple[str, ...] = ()

    def __post_init__(self) -> None:
        """Wrap a bare string in any tuple-of-strings field into a 1-tuple."""
        for field_name in (
            "acceptable_strategies",
            "policy_requirements",
            "required_evidence_ids",
            "helpful_evidence_ids",
            "harmful_evidence_ids",
        ):
            value = getattr(self, field_name)
            if isinstance(value, str):
                # The dataclass is frozen, so normal assignment is blocked.
                object.__setattr__(self, field_name, (value,))
49
+
50
+
51
@dataclass(frozen=True)
class TaskScenario:
    """Immutable definition of a single benchmark task."""

    # Identifier and human-readable metadata.
    task_id: str
    title: str
    difficulty: Literal["easy", "medium", "hard"]
    objective: str
    description: str
    # Step budget for one episode of this task.
    max_steps: int
    # The dispute cases that make up the task's queue.
    cases: tuple[InternalCase, ...]
62
+
63
+
64
@dataclass
class CaseProgress:
    """Mutable per-case bookkeeping for a running episode.

    All fields have defaults, so a fresh ``CaseProgress()`` describes an
    untouched, open case. Container fields use default factories, so each
    instance gets its own set/list.
    """

    # What the agent has unlocked so far.
    inspected: bool = False
    policy_retrieved: bool = False
    revealed_systems: set[SystemName] = field(default_factory=set)
    retrieved_evidence_ids: set[str] = field(default_factory=set)
    # A list, so the order in which evidence was attached is kept.
    attached_evidence_ids: list[str] = field(default_factory=list)
    # Strategy chosen so far and the final outcome, once there is one.
    current_strategy: StrategyName | None = None
    final_resolution: str | None = None
    # "open" until the case is resolved.
    resolution_status: str = "open"
    # Counters.
    duplicate_queries: int = 0
    invalid_actions: int = 0
    submit_attempts: int = 0
    # Set once the missed-deadline penalty has been charged.
    deadline_penalized: bool = False
    notes: list[str] = field(default_factory=list)
81
+
82
+
83
@dataclass
class ActionRecord:
    """One entry in the environment's runtime action history."""

    # Step at which the action was taken.
    step_index: int
    # Name of the action.
    action_type: str
    # Case the action targeted, if any.
    case_id: str | None
    # Result message and reward recorded for the action.
    outcome: str
    reward: float
92
+
93
+
94
def _ev(
    evidence_id: str,
    source_system: SystemName,
    title: str,
    summary: str,
    *,
    helpful: bool = False,
    harmful: bool = False,
    required: bool = False,
) -> InternalEvidence:
    """Build an ``InternalEvidence`` record with keyword-only flags.

    Shorthand used by the task table: the four descriptive fields are
    positional, while the classification flags must be named and default to
    ``False``.
    """
    flags = {"helpful": helpful, "harmful": harmful, "required": required}
    return InternalEvidence(
        evidence_id=evidence_id,
        source_system=source_system,
        title=title,
        summary=summary,
        **flags,
    )
113
+
114
+
115
+ TASKS: dict[str, TaskScenario] = {
116
+ "goods_not_received_easy": TaskScenario(
117
+ task_id="goods_not_received_easy",
118
+ title="Delivered But Disputed",
119
+ difficulty="easy",
120
+ objective="Contest a goods-not-received chargeback with the right delivery proof before the deadline.",
121
+ description=(
122
+ "A single e-commerce dispute where carrier confirmation and the order confirmation "
123
+ "are enough to win. The task teaches the standard representment loop."
124
+ ),
125
+ max_steps=10,
126
+ cases=(
127
+ InternalCase(
128
+ case_id="CB-E1",
129
+ order_id="ORD-7410",
130
+ customer_id="CUST-1001",
131
+ amount=129.99,
132
+ currency="USD",
133
+ reason_code="goods_not_received",
134
+ summary="Cardholder claims the package never arrived.",
135
+ inspection_notes=(
136
+ "Order shipped the same day. Merchant policy requires carrier proof plus the original order confirmation "
137
+ "for goods-not-received disputes."
138
+ ),
139
+ deadline_step=8,
140
+ optimal_strategy="contest",
141
+ acceptable_strategies=(),
142
+ policy_guidance=(
143
+ "For goods-not-received disputes, prove the merchandise was fulfilled to the billed customer with "
144
+ "order confirmation and carrier delivery evidence."
145
+ ),
146
+ policy_requirements=("order confirmation", "carrier delivery confirmation"),
147
+ recommended_strategy="contest",
148
+ resolution_summary="Strong delivery proof exists. Contesting should recover the funds.",
149
+ weight=1.0,
150
+ required_evidence_ids=("E1-ORDER-CONF", "E1-DELIVERY-SCAN"),
151
+ helpful_evidence_ids=("E1-ORDER-CONF", "E1-DELIVERY-SCAN", "E1-SUPPORT-ACK"),
152
+ harmful_evidence_ids=(),
153
+ evidence_by_system={
154
+ "orders": (
155
+ _ev(
156
+ "E1-ORDER-CONF",
157
+ "orders",
158
+ "Order confirmation",
159
+ "Order confirmation email and checkout receipt showing the billed customer, shipping address, and SKU.",
160
+ helpful=True,
161
+ required=True,
162
+ ),
163
+ ),
164
+ "payment": (
165
+ _ev(
166
+ "E1-AUTH",
167
+ "payment",
168
+ "Authorization record",
169
+ "Authorization approved and captured successfully.",
170
+ ),
171
+ ),
172
+ "shipping": (
173
+ _ev(
174
+ "E1-DELIVERY-SCAN",
175
+ "shipping",
176
+ "Carrier delivery scan",
177
+ "Carrier tracking shows delivered to the customer address two days after shipment.",
178
+ helpful=True,
179
+ required=True,
180
+ ),
181
+ _ev(
182
+ "E1-SIGNATURE",
183
+ "shipping",
184
+ "Doorstep photo confirmation",
185
+ "Carrier stored a package photo at the delivery location.",
186
+ helpful=True,
187
+ ),
188
+ ),
189
+ "support": (
190
+ _ev(
191
+ "E1-SUPPORT-ACK",
192
+ "support",
193
+ "Support ticket acknowledgement",
194
+ "Customer contacted support to ask if the package was left at the front desk after delivery.",
195
+ helpful=True,
196
+ ),
197
+ ),
198
+ "refunds": (
199
+ _ev(
200
+ "E1-NO-REFUND",
201
+ "refunds",
202
+ "Refund ledger",
203
+ "No refund or goodwill credit was issued before the dispute opened.",
204
+ ),
205
+ ),
206
+ "risk": (
207
+ _ev(
208
+ "E1-RISK",
209
+ "risk",
210
+ "Risk summary",
211
+ "Low-risk order with no fraud flags.",
212
+ ),
213
+ ),
214
+ },
215
+ ),
216
+ ),
217
+ ),
218
+ "fraud_signal_ambiguity": TaskScenario(
219
+ task_id="fraud_signal_ambiguity",
220
+ title="Fraud Signal Ambiguity",
221
+ difficulty="medium",
222
+ objective="Choose whether to contest a CNP fraud dispute and curate only the evidence that helps.",
223
+ description=(
224
+ "A card-not-present fraud dispute with mixed signals. Strong account-linkage evidence exists, "
225
+ "but payment mismatch artifacts will hurt the case if attached."
226
+ ),
227
+ max_steps=12,
228
+ cases=(
229
+ InternalCase(
230
+ case_id="CB-M1",
231
+ order_id="ORD-8821",
232
+ customer_id="CUST-2048",
233
+ amount=480.0,
234
+ currency="USD",
235
+ reason_code="fraud_cnp",
236
+ summary="Issuer filed a card-not-present fraud dispute on a high-value electronics order.",
237
+ inspection_notes=(
238
+ "The order used a known account and device, but AVS/CVV mismatches were present. "
239
+ "Winning requires emphasizing customer-account linkage and avoiding mismatch artifacts."
240
+ ),
241
+ deadline_step=9,
242
+ optimal_strategy="contest",
243
+ acceptable_strategies=("accept_chargeback",),
244
+ policy_guidance=(
245
+ "For CNP fraud disputes, contest only when you can link the cardholder to the account or device history. "
246
+ "Do not attach evidence that strengthens the issuer's fraud narrative."
247
+ ),
248
+ policy_requirements=("prior good order linkage", "customer account confirmation"),
249
+ recommended_strategy="contest",
250
+ resolution_summary="Contest only with strong account-linkage evidence. Conceding is acceptable but suboptimal.",
251
+ weight=1.1,
252
+ required_evidence_ids=("M1-PRIOR-ORDERS", "M1-ACCOUNT-CHAT"),
253
+ helpful_evidence_ids=("M1-PRIOR-ORDERS", "M1-ACCOUNT-CHAT", "M1-DELIVERY"),
254
+ harmful_evidence_ids=("M1-AVS-MISMATCH", "M1-CVV-MISMATCH"),
255
+ evidence_by_system={
256
+ "orders": (
257
+ _ev(
258
+ "M1-ORDER",
259
+ "orders",
260
+ "Order receipt",
261
+ "Checkout receipt showing customer account id, shipping address, and same email as prior purchases.",
262
+ helpful=True,
263
+ ),
264
+ ),
265
+ "payment": (
266
+ _ev(
267
+ "M1-AVS-MISMATCH",
268
+ "payment",
269
+ "AVS mismatch detail",
270
+ "Street-number mismatch was recorded at authorization time.",
271
+ harmful=True,
272
+ ),
273
+ _ev(
274
+ "M1-CVV-MISMATCH",
275
+ "payment",
276
+ "CVV mismatch detail",
277
+ "CVV did not fully match at authorization time.",
278
+ harmful=True,
279
+ ),
280
+ _ev(
281
+ "M1-AUTH",
282
+ "payment",
283
+ "Authorization capture",
284
+ "Payment was successfully authorized and captured.",
285
+ ),
286
+ ),
287
+ "shipping": (
288
+ _ev(
289
+ "M1-DELIVERY",
290
+ "shipping",
291
+ "Carrier delivery confirmation",
292
+ "Package was delivered to the saved customer address two days later.",
293
+ helpful=True,
294
+ ),
295
+ ),
296
+ "support": (
297
+ _ev(
298
+ "M1-ACCOUNT-CHAT",
299
+ "support",
300
+ "Authenticated support chat",
301
+ "Customer logged into the account and confirmed the delivery window in chat before shipment.",
302
+ helpful=True,
303
+ required=True,
304
+ ),
305
+ ),
306
+ "refunds": (
307
+ _ev(
308
+ "M1-NO-REFUND",
309
+ "refunds",
310
+ "Refund ledger",
311
+ "No refund or cancellation was issued prior to the dispute.",
312
+ ),
313
+ ),
314
+ "risk": (
315
+ _ev(
316
+ "M1-PRIOR-ORDERS",
317
+ "risk",
318
+ "Prior account activity",
319
+ "Same account, same device fingerprint, and three prior fulfilled orders without disputes.",
320
+ helpful=True,
321
+ required=True,
322
+ ),
323
+ _ev(
324
+ "M1-VELOCITY",
325
+ "risk",
326
+ "Velocity check",
327
+ "No abnormal velocity or proxy usage detected.",
328
+ helpful=True,
329
+ ),
330
+ ),
331
+ },
332
+ ),
333
+ ),
334
+ ),
335
+ "queue_optimization_hard": TaskScenario(
336
+ task_id="queue_optimization_hard",
337
+ title="Dispute Queue Optimization",
338
+ difficulty="hard",
339
+ objective="Maximize recovery across a queue of disputes while respecting deadlines and avoiding weak contests.",
340
+ description=(
341
+ "A real operations queue with three disputes. Two should be actioned quickly, and one should be conceded. "
342
+ "The step budget leaves little room for waste."
343
+ ),
344
+ max_steps=18,
345
+ cases=(
346
+ InternalCase(
347
+ case_id="CB-H1",
348
+ order_id="ORD-9901",
349
+ customer_id="CUST-4100",
350
+ amount=860.0,
351
+ currency="USD",
352
+ reason_code="goods_not_received",
353
+ summary="High-value furniture delivery disputed as not received.",
354
+ inspection_notes=(
355
+ "Carrier stored both a delivery scan and signature. This is the highest-value recoverable case in the queue."
356
+ ),
357
+ deadline_step=7,
358
+ optimal_strategy="contest",
359
+ acceptable_strategies=(),
360
+ policy_guidance=(
361
+ "Use merchant receipt plus carrier proof for goods-not-received disputes. This case is strong if contested on time."
362
+ ),
363
+ policy_requirements=("order confirmation", "signature-backed delivery proof"),
364
+ recommended_strategy="contest",
365
+ resolution_summary="Contest immediately with the signature-backed delivery packet.",
366
+ weight=1.7,
367
+ required_evidence_ids=("H1-ORDER-CONF", "H1-SIGNATURE"),
368
+ helpful_evidence_ids=("H1-ORDER-CONF", "H1-SIGNATURE", "H1-DELIVERY-SCAN"),
369
+ harmful_evidence_ids=(),
370
+ evidence_by_system={
371
+ "orders": (
372
+ _ev(
373
+ "H1-ORDER-CONF",
374
+ "orders",
375
+ "Order invoice",
376
+ "Signed furniture order invoice with billing and delivery address.",
377
+ helpful=True,
378
+ required=True,
379
+ ),
380
+ ),
381
+ "payment": (
382
+ _ev(
383
+ "H1-AUTH",
384
+ "payment",
385
+ "Captured payment",
386
+ "Payment authorization and capture both succeeded.",
387
+ ),
388
+ ),
389
+ "shipping": (
390
+ _ev(
391
+ "H1-SIGNATURE",
392
+ "shipping",
393
+ "Delivery signature",
394
+ "Carrier recorded a recipient signature at the shipping address.",
395
+ helpful=True,
396
+ required=True,
397
+ ),
398
+ _ev(
399
+ "H1-DELIVERY-SCAN",
400
+ "shipping",
401
+ "Final-mile delivery scan",
402
+ "Tracking confirms delivery within the promised window.",
403
+ helpful=True,
404
+ ),
405
+ ),
406
+ "support": (
407
+ _ev(
408
+ "H1-SUPPORT",
409
+ "support",
410
+ "Support history",
411
+ "No delivery complaint was opened before the dispute.",
412
+ ),
413
+ ),
414
+ "refunds": (
415
+ _ev(
416
+ "H1-NO-REFUND",
417
+ "refunds",
418
+ "Refund ledger",
419
+ "No refund was issued.",
420
+ ),
421
+ ),
422
+ "risk": (
423
+ _ev(
424
+ "H1-RISK",
425
+ "risk",
426
+ "Risk summary",
427
+ "Low-risk order. No notable fraud flags.",
428
+ ),
429
+ ),
430
+ },
431
+ ),
432
+ InternalCase(
433
+ case_id="CB-H2",
434
+ order_id="ORD-9902",
435
+ customer_id="CUST-4101",
436
+ amount=240.0,
437
+ currency="USD",
438
+ reason_code="fraud_cnp",
439
+ summary="Apparel order disputed as unauthorized.",
440
+ inspection_notes=(
441
+ "The account is new, there is no durable linkage to the cardholder, and the payment record contains mismatch artifacts. "
442
+ "This case should be conceded."
443
+ ),
444
+ deadline_step=14,
445
+ optimal_strategy="accept_chargeback",
446
+ acceptable_strategies=("issue_refund",),
447
+ policy_guidance=(
448
+ "Do not contest when you lack durable account or device linkage. Avoid wasting steps on weak fraud disputes."
449
+ ),
450
+ policy_requirements=("cardholder linkage evidence"),
451
+ recommended_strategy="accept_chargeback",
452
+ resolution_summary="Concede the dispute. Contesting wastes portfolio value.",
453
+ weight=0.8,
454
+ required_evidence_ids=(),
455
+ helpful_evidence_ids=(),
456
+ harmful_evidence_ids=("H2-AVS", "H2-CVV"),
457
+ evidence_by_system={
458
+ "orders": (
459
+ _ev(
460
+ "H2-ORDER",
461
+ "orders",
462
+ "Order receipt",
463
+ "Guest checkout with a new shipping address and no prior order history.",
464
+ ),
465
+ ),
466
+ "payment": (
467
+ _ev(
468
+ "H2-AVS",
469
+ "payment",
470
+ "AVS mismatch detail",
471
+ "Street and postal code mismatches were present.",
472
+ harmful=True,
473
+ ),
474
+ _ev(
475
+ "H2-CVV",
476
+ "payment",
477
+ "CVV mismatch detail",
478
+ "CVV did not match.",
479
+ harmful=True,
480
+ ),
481
+ ),
482
+ "shipping": (
483
+ _ev(
484
+ "H2-DELIVERY",
485
+ "shipping",
486
+ "Carrier delivery confirmation",
487
+ "Delivered to a new address without signature.",
488
+ ),
489
+ ),
490
+ "support": (
491
+ _ev(
492
+ "H2-SUPPORT",
493
+ "support",
494
+ "Support log",
495
+ "No authenticated support interactions were recorded.",
496
+ ),
497
+ ),
498
+ "refunds": (
499
+ _ev(
500
+ "H2-NO-REFUND",
501
+ "refunds",
502
+ "Refund ledger",
503
+ "No refund issued before the chargeback.",
504
+ ),
505
+ ),
506
+ "risk": (
507
+ _ev(
508
+ "H2-RISK",
509
+ "risk",
510
+ "Risk summary",
511
+ "Elevated risk score and no positive account history.",
512
+ ),
513
+ ),
514
+ },
515
+ ),
516
+ InternalCase(
517
+ case_id="CB-H3",
518
+ order_id="ORD-9903",
519
+ customer_id="CUST-4102",
520
+ amount=320.0,
521
+ currency="USD",
522
+ reason_code="credit_not_processed",
523
+ summary="Subscriber canceled before renewal and says the credit was never processed.",
524
+ inspection_notes=(
525
+ "The merchant missed the promised refund SLA. This should be resolved fast with a refund, not a contest."
526
+ ),
527
+ deadline_step=4,
528
+ optimal_strategy="issue_refund",
529
+ acceptable_strategies=("accept_chargeback",),
530
+ policy_guidance=(
531
+ "If the merchant failed to process a promised credit, refund immediately or concede. Contesting is not supportable."
532
+ ),
533
+ policy_requirements=("proof of cancellation request", "refund status check"),
534
+ recommended_strategy="issue_refund",
535
+ resolution_summary="Refund immediately. Delay turns a manageable loss into a deadline miss.",
536
+ weight=1.2,
537
+ required_evidence_ids=(),
538
+ helpful_evidence_ids=("H3-CANCEL", "H3-NO-REFUND"),
539
+ harmful_evidence_ids=(),
540
+ evidence_by_system={
541
+ "orders": (
542
+ _ev(
543
+ "H3-ORDER",
544
+ "orders",
545
+ "Renewal invoice",
546
+ "Subscription renewed automatically for the annual plan.",
547
+ ),
548
+ ),
549
+ "payment": (
550
+ _ev(
551
+ "H3-PAYMENT",
552
+ "payment",
553
+ "Captured renewal payment",
554
+ "Renewal payment settled successfully.",
555
+ ),
556
+ ),
557
+ "shipping": (),
558
+ "support": (
559
+ _ev(
560
+ "H3-CANCEL",
561
+ "support",
562
+ "Cancellation request",
563
+ "Customer requested cancellation before renewal and support promised a refund within five business days.",
564
+ helpful=True,
565
+ ),
566
+ ),
567
+ "refunds": (
568
+ _ev(
569
+ "H3-NO-REFUND",
570
+ "refunds",
571
+ "Refund ledger",
572
+ "No refund has been issued as of the dispute open date.",
573
+ helpful=True,
574
+ ),
575
+ ),
576
+ "risk": (),
577
+ },
578
+ ),
579
+ ),
580
+ ),
581
+ }
582
+
583
+
584
def get_task(task_id: str) -> TaskScenario:
    """Return the scenario registered under ``task_id``.

    Raises:
        KeyError: If ``task_id`` is not a key of ``TASKS``.
    """
    scenario = TASKS[task_id]
    return scenario
588
+
589
+
590
def list_tasks() -> list[TaskScenario]:
    """Return all benchmark tasks in a fixed order: easy, medium, then hard.

    The order is spelled out explicitly rather than taken from the ``TASKS``
    dict.
    """
    stable_order = (
        "goods_not_received_easy",
        "fraud_signal_ambiguity",
        "queue_optimization_hard",
    )
    scenarios = []
    for task_id in stable_order:
        scenarios.append(TASKS[task_id])
    return scenarios
uv.lock ADDED
The diff for this file is too large to render. See raw diff