File size: 30,743 Bytes
486044c
 
 
 
 
 
 
 
 
8b10144
 
486044c
 
8b10144
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
486044c
 
8b10144
486044c
8b10144
 
 
 
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
486044c
 
8b10144
486044c
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
8b10144
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
486044c
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
 
8b10144
486044c
 
8b10144
486044c
 
 
 
8b10144
486044c
 
 
 
 
8b10144
 
486044c
 
 
 
 
 
 
 
 
 
 
8b10144
486044c
 
 
 
 
 
 
 
 
 
8b10144
 
486044c
 
8b10144
486044c
 
 
 
 
 
 
 
 
8b10144
486044c
8b10144
486044c
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
8b10144
486044c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b10144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486044c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""
Comprehensive tests for the API Integration Debugging Environment.

Tests cover:
- Environment reset and initialization
- Action handling (inspect_logs, inspect_config, inspect_endpoint, submit_fix)
- Multi-dimensional grading rubric
- Fix validation (strict value matching + partial credit)
- Episode termination conditions
- Repeated inspection penalty
- Seed-based reproducibility and issue pool selection
- Dynamic state: service health, cascading failures, dynamic logs
- Strategy scoring
"""

import sys
import os
import pytest

# Add parent directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from models import ApiDebugAction, ApiDebugObservation
from server.api_debug_env_environment import ApiDebugEnvironment
from scenarios import get_scenario, get_all_task_ids, Issue


# ─── Scenario Tests ──────────────────────────────────────────────────────────


class TestScenarios:
    """Checks on scenario lookup and on the static data each scenario carries."""

    @staticmethod
    def _each_scenario():
        """Yield ``(task_id, scenario)`` for every registered task, loading lazily."""
        for task_id in get_all_task_ids():
            yield task_id, get_scenario(task_id)

    def test_all_task_ids_returns_three(self):
        """The registry lists exactly easy, medium and hard, in that order."""
        assert get_all_task_ids() == ["easy", "medium", "hard"]

    @pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
    def test_scenario_loads(self, task_id):
        """A loaded scenario echoes its id and has issues, services and a step budget."""
        loaded = get_scenario(task_id)
        assert loaded.task_id == task_id
        for collection in (loaded.issues, loaded.services):
            assert len(collection) > 0
        assert loaded.max_steps > 0

    def test_invalid_task_id_raises(self):
        """An unknown task id is rejected with a ``ValueError``."""
        expected_failure = pytest.raises(ValueError, match="Unknown task_id")
        with expected_failure:
            get_scenario("nonexistent")

    def test_easy_has_two_issues(self):
        """The easy scenario carries two issues."""
        assert len(get_scenario("easy").issues) == 2

    def test_medium_has_three_issues(self):
        """The medium scenario carries three issues."""
        assert len(get_scenario("medium").issues) == 3

    def test_hard_has_five_issues(self):
        """The hard scenario carries five issues."""
        assert len(get_scenario("hard").issues) == 5

    def test_seed_randomization_reproducible(self):
        """Loading twice with one seed yields the same issue ids in the same order."""
        runs = [get_scenario("easy", seed=42) for _ in range(2)]
        id_lists = [[issue.issue_id for issue in run.issues] for run in runs]
        assert id_lists[0] == id_lists[1]

    def test_different_seeds_may_vary(self):
        """Different seeds must still give valid scenarios.

        The selections are allowed to differ, so only the issue count is
        checked for each seed.
        """
        scenarios = [get_scenario("easy", seed=seed) for seed in (42, 99)]
        for scenario in scenarios:
            assert len(scenario.issues) == 2

    def test_each_issue_has_log_hint(self):
        """Each issue's log hint occurs in at least one log line of its scenario."""
        for _, scenario in self._each_scenario():
            for issue in scenario.issues:
                # Flattened search over every service's log lines; stops at the first hit.
                hint_present = any(
                    issue.log_hint in line
                    for lines in scenario.logs.values()
                    for line in lines
                )
                assert hint_present, f"Issue {issue.issue_id} log_hint '{issue.log_hint}' not found in any logs"

    def test_service_graph_exists(self):
        """The dependency graph is non-empty and has an entry for every service."""
        for task_id, scenario in self._each_scenario():
            assert len(scenario.service_graph) > 0
            for svc in scenario.services:
                assert svc in scenario.service_graph, f"Service {svc} missing from graph in {task_id}"

    def test_dynamic_logs_defined(self):
        """Every scenario defines at least one dynamic log entry."""
        for task_id, scenario in self._each_scenario():
            assert len(scenario.dynamic_logs) > 0, f"No dynamic logs in {task_id}"

    def test_optimal_fix_order_defined(self):
        """Every scenario defines a non-empty optimal fix order."""
        for _, scenario in self._each_scenario():
            assert len(scenario.optimal_fix_order) > 0

    def test_issues_have_categories(self):
        """Every issue's category is one of the four recognised names."""
        allowed = ("configuration", "authentication", "networking", "protocol")
        for _, scenario in self._each_scenario():
            for issue in scenario.issues:
                assert issue.category in allowed, f"Issue {issue.issue_id} has invalid category: {issue.category}"

    def test_context_provided(self):
        """Every scenario ships a non-empty context."""
        for _, scenario in self._each_scenario():
            assert len(scenario.context) > 0


# ─── Environment Reset Tests ─────────────────────────────────────────────────


class TestEnvironmentReset:
    """Checks on the observation handed back by ``reset()``."""

    @staticmethod
    def _first_observation():
        """Create an ``easy`` environment and return what ``reset()`` produces."""
        fresh_env = ApiDebugEnvironment(task_id="easy")
        return fresh_env.reset()

    def test_reset_returns_observation(self):
        """``reset()`` returns an ``ApiDebugObservation``."""
        assert isinstance(self._first_observation(), ApiDebugObservation)

    def test_reset_clears_state(self):
        """A fresh episode has no progress, is not done, and has the full step budget."""
        initial = self._first_observation()
        assert initial.issues_found == 0
        assert initial.issues_fixed == 0
        assert initial.done is False
        # 15 is the step budget the easy scenario is expected to expose.
        assert initial.remaining_steps == 15

    def test_reset_provides_available_targets(self):
        """The observation lists inspectable targets, including ``payment_client``."""
        targets = self._first_observation().available_targets
        assert len(targets) > 0
        assert "payment_client" in targets

    def test_reset_with_different_task(self):
        """Passing ``task_id`` to ``reset()`` overrides the constructor's task."""
        switchable_env = ApiDebugEnvironment(task_id="easy")
        hard_obs = switchable_env.reset(task_id="hard")
        assert hard_obs.issues_total == 5

    def test_initial_reward_is_zero(self):
        """No reward has been earned straight after a reset."""
        assert self._first_observation().reward == 0.0

    def test_reset_includes_service_status(self):
        """The observation carries a health entry for ``payment_client``."""
        status = self._first_observation().service_status
        assert len(status) > 0
        assert "payment_client" in status

    def test_reset_includes_dependency_graph(self):
        """The observation carries a non-empty dependency graph."""
        assert len(self._first_observation().dependency_graph) > 0

    def test_reset_includes_error_trace(self):
        """The observation carries a non-empty initial error trace."""
        assert len(self._first_observation().error_trace) > 0


# ─── Action Handler Tests ────────────────────────────────────────────────────


class TestInspectLogs:
    """Behaviour of the ``inspect_logs`` action on the ``easy`` task."""

    @staticmethod
    def _started_env():
        """Return an ``easy`` environment that has already been reset."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        return env

    @staticmethod
    def _read_client_logs(env):
        """Step ``env`` with ``inspect_logs`` on ``payment_client``; return the observation."""
        request = ApiDebugAction(action_type="inspect_logs", target="payment_client")
        return env.step(request)

    def test_inspect_logs_returns_logs(self):
        """Inspecting a service yields at least one log line."""
        result = self._read_client_logs(self._started_env())
        assert len(result.logs) > 0

    def test_inspect_logs_finds_issues(self):
        """The first inspection discovers issues and earns a positive reward."""
        result = self._read_client_logs(self._started_env())
        assert result.issues_found > 0
        assert result.reward > 0

    def test_repeated_inspect_logs_no_reward(self):
        """Inspecting the same target a second time must pay less than the first."""
        env = self._started_env()
        first = self._read_client_logs(env)
        second = self._read_client_logs(env)
        assert second.reward < first.reward

    def test_dynamic_logs_after_fix(self):
        """Logs read after a fix mention the fix (new entries are expected to appear)."""
        env = self._started_env()
        # Apply the content-type fix first.
        content_type_fix = ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        )
        env.step(content_type_fix)

        # Re-inspect: the output should now include the dynamic log entries
        # on top of the original ones.
        after_fix = self._read_client_logs(env)

        def mentions_fix(entry):
            lowered = entry.lower()
            return "application/json" in lowered or "parsed" in lowered

        assert any(mentions_fix(entry) for entry in after_fix.logs)


class TestInspectConfig:
    """Behaviour of the ``inspect_config`` action."""

    def test_inspect_config_returns_config(self):
        """Inspecting ``payment_client`` returns a config snapshot with a ``headers`` key."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        request = ApiDebugAction(action_type="inspect_config", target="payment_client")
        snapshot = env.step(request).config_snapshot
        assert len(snapshot) > 0
        assert "headers" in snapshot


class TestInspectEndpoint:
    """Behaviour of the ``inspect_endpoint`` action."""

    @staticmethod
    def _started_env():
        """Return an ``easy`` environment that has already been reset."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        return env

    @staticmethod
    def _probe_client(env):
        """Step ``env`` with ``inspect_endpoint`` on ``payment_client``; return the observation."""
        request = ApiDebugAction(action_type="inspect_endpoint", target="payment_client")
        return env.step(request)

    def test_inspect_endpoint_shows_error(self):
        """Before any fix the endpoint response reports an error status."""
        response = self._probe_client(self._started_env()).api_response
        assert response is not None
        assert response["status"] == "error"

    def test_inspect_endpoint_shows_success_after_fix(self):
        """After both fixes are submitted the service is recorded as healthy.

        NOTE(review): despite the name, this test does not issue an
        ``inspect_endpoint`` action; it reads the environment's private
        ``_service_health`` mapping instead, presumably because the episode
        is already over once both issues are fixed. Confirm with the author.
        """
        env = self._started_env()
        payloads = (
            {"headers.Authorization": "Bearer valid_token_123"},
            {"headers.Content-Type": "application/json"},
        )
        for payload in payloads:
            env.step(ApiDebugAction(
                action_type="submit_fix",
                target="payment_client",
                fix_payload=payload,
            ))
        client_health = env._service_health.get("payment_client")
        assert client_health == "healthy"

    def test_inspect_endpoint_shows_category_status_code(self):
        """The error response carries one of the expected HTTP status codes."""
        response = self._probe_client(self._started_env()).api_response
        assert response is not None
        # Accepted codes for the easy task's failures.
        assert response["status_code"] in [401, 415, 500, 504]


class TestSubmitFix:
    """Behaviour of ``submit_fix``: acceptance, rejection, partial credit, completion."""

    @staticmethod
    def _started_env():
        """Return an ``easy`` environment that has already been reset."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        return env

    @staticmethod
    def _submit(env, payload, target="payment_client"):
        """Step ``env`` with a ``submit_fix`` action and return the observation."""
        return env.step(ApiDebugAction(
            action_type="submit_fix",
            target=target,
            fix_payload=payload,
        ))

    def test_correct_fix_accepted(self):
        """A correct content-type fix is counted and acknowledged in the result text."""
        result = self._submit(
            self._started_env(),
            {"headers.Content-Type": "application/json"},
        )
        assert result.issues_fixed > 0
        assert "accepted" in result.action_result.lower() or "fixed" in result.action_result.lower()

    def test_wrong_value_rejected(self):
        """The right key with a wrong value fixes nothing."""
        result = self._submit(
            self._started_env(),
            {"headers.Content-Type": "text/xml"},
        )
        assert result.issues_fixed == 0

    def test_partial_credit_close_value(self):
        """The right key with a near-miss value is penalised less than a flat reject."""
        result = self._submit(
            self._started_env(),
            {"headers.Content-Type": "application/xml"},
        )
        # "application/xml" shares the "application/" prefix with the expected
        # value; the reward must stay above -0.05.
        assert result.reward > -0.05

    def test_correct_auth_fix(self):
        """A Bearer header with an arbitrary non-empty token is accepted."""
        result = self._submit(
            self._started_env(),
            {"headers.Authorization": "Bearer my_actual_api_key_123"},
        )
        assert result.issues_fixed > 0

    def test_empty_payload_rejected(self):
        """Submitting an empty payload costs reward."""
        result = self._submit(self._started_env(), {})
        assert result.reward < 0

    def test_invalid_target_penalized(self):
        """Submitting against an unknown service costs reward."""
        result = self._submit(
            self._started_env(),
            {"key": "value"},
            target="nonexistent_service",
        )
        assert result.reward < 0

    def test_fix_all_issues_completes_episode(self):
        """Once both easy issues are fixed the episode is done."""
        env = self._started_env()
        self._submit(env, {"headers.Authorization": "Bearer valid_token_123"})
        final = self._submit(env, {"headers.Content-Type": "application/json"})
        assert final.done is True
        assert final.issues_fixed == 2

    def test_strategy_bonus_for_inspecting_first(self):
        """The same fix earns more when the target's logs were inspected beforehand."""
        # Episode A: fix blind, with no prior inspection.
        blind_env = self._started_env()
        blind_result = self._submit(
            blind_env,
            {"headers.Content-Type": "application/json"},
        )

        # Episode B: read the logs, then submit the identical fix.
        informed_env = self._started_env()
        informed_env.step(ApiDebugAction(
            action_type="inspect_logs",
            target="payment_client",
        ))
        informed_result = self._submit(
            informed_env,
            {"headers.Content-Type": "application/json"},
        )

        assert informed_result.reward > blind_result.reward


# ─── Service Health Tests ─────────────────────────────────────────────────────


class TestServiceHealth:
    """Test dynamic service health tracking.

    Two of these tests read private members of the environment
    (``_service_health`` and ``_build_error_trace``), so they are coupled to
    its internals as well as to the public observation.
    """

    def test_initial_health_reflects_issues(self):
        """Services with issues should start as degraded/error."""
        env = ApiDebugEnvironment(task_id="easy")
        obs = env.reset()
        assert obs.service_status.get("payment_client") in ("error", "degraded")

    def test_health_updates_after_fix(self):
        """Fixing all issues on a service should mark it healthy."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": "Bearer valid_token_123"},
        ))
        # The observation returned by the final step is not inspected; health
        # is read from the environment itself, so the result is not bound.
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # payment_client should be healthy after both fixes
        assert env._service_health.get("payment_client") == "healthy"

    def test_error_trace_updates(self):
        """Error trace should shrink as issues are fixed."""
        env = ApiDebugEnvironment(task_id="easy")
        obs1 = env.reset()
        initial_trace_len = len(obs1.error_trace)

        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        # Rebuild the trace straight from the environment after one fix and
        # compare its length with the one reported at reset.
        trace_after_fix = env._build_error_trace()
        assert len(trace_after_fix) < initial_trace_len


# ─── Grading Tests ────────────────────────────────────────────────────────────


class TestGrading:
    """Checks on the score returned by ``env.grade()`` for different play styles."""

    @staticmethod
    def _started_env(task_id="easy"):
        """Return an environment for ``task_id`` that has already been reset."""
        env = ApiDebugEnvironment(task_id=task_id)
        env.reset()
        return env

    @staticmethod
    def _inspect(env, action_type, target="payment_client"):
        """Step ``env`` with an inspection action (no fix payload is passed)."""
        return env.step(ApiDebugAction(action_type=action_type, target=target))

    @staticmethod
    def _fix_auth(env, header_value):
        """Submit an Authorization header fix on ``payment_client``."""
        return env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Authorization": header_value},
        ))

    @staticmethod
    def _fix_content_type(env):
        """Submit the JSON Content-Type fix on ``payment_client``."""
        return env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))

    def test_grade_no_fixes_is_low(self):
        """With one inspection and no fixes the score is low but above zero."""
        env = self._started_env()
        self._inspect(env, "inspect_logs")
        score = env.grade()
        # Exploration alone earns something, but less than half.
        assert 0.0 < score < 0.5

    def test_grade_all_fixes_is_high(self):
        """Inspecting and then fixing both issues scores above 0.6."""
        env = self._started_env()
        self._inspect(env, "inspect_logs")
        self._inspect(env, "inspect_config")
        self._fix_auth(env, "Bearer valid_token_123")
        self._fix_content_type(env)
        assert env.grade() > 0.6

    def test_grade_strictly_between_0_and_1(self):
        """A freshly reset episode grades strictly inside (0, 1) for every task."""
        for task_id in get_all_task_ids():
            env = self._started_env(task_id)
            score = env.grade()
            assert 0.0 < score < 1.0, f"Score for {task_id} was {score}"

    def test_efficiency_bonus(self):
        """With identical fixes, the episode that wastes steps grades lower."""
        # Fast episode: one inspection, then the fix.
        quick_env = self._started_env()
        self._inspect(quick_env, "inspect_logs")
        self._fix_content_type(quick_env)
        score_fast = quick_env.grade()

        # Slow episode: same inspection and fix, with ten extra log reads
        # on another target in between.
        padded_env = self._started_env()
        self._inspect(padded_env, "inspect_logs")
        for _ in range(10):
            self._inspect(padded_env, "inspect_logs", target="payment_gateway")
        self._fix_content_type(padded_env)
        score_slow = padded_env.grade()

        assert score_fast > score_slow, f"Fast={score_fast} should beat Slow={score_slow}"

    def test_strategy_affects_grade(self):
        """Inspecting before fixing scores at least 90% of the fix-only score.

        The bound is deliberately loose: the inspecting run spends two extra
        steps, so it is only required to stay close to the blind run.
        """
        # Blind run: both fixes, no inspection.
        blind_env = self._started_env()
        self._fix_auth(blind_env, "Bearer token")
        self._fix_content_type(blind_env)
        score_no_inspect = blind_env.grade()

        # Informed run: logs and config first, then the same fixes.
        informed_env = self._started_env()
        self._inspect(informed_env, "inspect_logs")
        self._inspect(informed_env, "inspect_config")
        self._fix_auth(informed_env, "Bearer token")
        self._fix_content_type(informed_env)
        score_with_inspect = informed_env.grade()

        assert score_with_inspect >= score_no_inspect * 0.9

    def test_grade_dimensions_nonzero(self):
        """One inspection plus one accepted fix yields a score above 0.001."""
        env = self._started_env()
        self._inspect(env, "inspect_logs")
        self._fix_content_type(env)
        assert env.grade() > 0.001


# ─── Episode Termination Tests ────────────────────────────────────────────────


class TestEpisodeTermination:
    """Test episode ending conditions."""

    def test_out_of_steps_ends_episode(self):
        """Spending the whole step budget without fixing anything ends the episode."""
        env = ApiDebugEnvironment(task_id="easy")
        obs = env.reset()
        # Read the budget from the environment rather than hard-coding it, so
        # the test keeps exercising exhaustion if the scenario's budget changes.
        step_budget = obs.remaining_steps
        assert step_budget > 0
        for _ in range(step_budget):
            obs = env.step(ApiDebugAction(
                action_type="inspect_logs",
                target="payment_client",
            ))
        assert obs.done is True
        assert obs.remaining_steps == 0

    def test_invalid_action_type_penalized(self):
        """An unrecognised action type yields a negative reward."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        obs = env.step(ApiDebugAction(
            action_type="nonexistent_action",
            target="payment_client",
        ))
        assert obs.reward < 0


# ─── Value Matching Tests ─────────────────────────────────────────────────────


class TestValueMatching:
    """Direct checks of the environment's private ``_values_match`` comparison.

    Every call below passes the expected value first and the submitted value
    second.
    """

    def setup_method(self):
        """Give each test its own ``easy`` environment (it is not reset)."""
        self.env = ApiDebugEnvironment(task_id="easy")

    def test_exact_string_match(self):
        """Identical strings match."""
        matches = self.env._values_match
        assert matches("application/json", "application/json")

    def test_case_insensitive_match(self):
        """Strings differing only in letter case match."""
        matches = self.env._values_match
        assert matches("Application/JSON", "application/json")

    def test_numeric_exact(self):
        """Equal integers match."""
        matches = self.env._values_match
        assert matches(10, 10)

    def test_numeric_tolerance_tight(self):
        """Numbers match only within a tight tolerance (10% per the original note)."""
        matches = self.env._values_match
        assert matches(10, 10)  # identical
        assert matches(10, 9.5)  # 5% off: accepted
        assert not matches(10, 8)  # 20% off: rejected

    def test_boolean_match(self):
        """Booleans match only when equal."""
        matches = self.env._values_match
        assert matches(True, True)
        assert not matches(True, False)

    def test_boolean_from_string(self):
        """The strings "true"/"false" match the corresponding booleans."""
        matches = self.env._values_match
        assert matches(True, "true")
        assert matches(False, "false")

    def test_list_containment(self):
        """A submitted list matches when it contains every expected element."""
        matches = self.env._values_match
        assert matches([429, 500], [429, 500])
        # Different order plus an extra element is still accepted.
        assert matches([429, 500], [500, 429, 502])

    def test_bearer_token_pattern(self):
        """The ``Bearer <token>`` placeholder accepts any non-empty token."""
        matches = self.env._values_match
        assert matches("Bearer <token>", "Bearer my_secret_key")
        # A bare "Bearer " with nothing after it is rejected.
        assert not matches("Bearer <token>", "Bearer ")

    def test_wrong_value_rejected(self):
        """Clearly different strings and numbers do not match."""
        matches = self.env._values_match
        assert not matches("application/json", "text/xml")
        assert not matches(10, 100)


class TestPartialCredit:
    """Test the ``_values_close`` and ``_check_fix`` methods used for partial credit."""

    def setup_method(self):
        """Create a fresh ``easy`` environment for each test (it is not reset)."""
        self.env = ApiDebugEnvironment(task_id="easy")

    @staticmethod
    def _timeout_issue():
        """Build the throwaway issue shared by the ``_check_fix`` tests.

        The issue expects ``{"timeout": 10}`` under the fix key ``timeout``.
        A new instance is returned on every call so tests cannot affect each
        other through shared state.
        """
        return Issue(
            issue_id="test",
            service="test_svc",
            description="test",
            expected_fix={"timeout": 10},
            fix_key="timeout",
            log_hint="test",
        )

    def test_numeric_close(self):
        """7 counts as close to 10, while 100 does not."""
        assert self.env._values_close(10, 7)  # original note: within 50%
        assert not self.env._values_close(10, 100)

    def test_string_same_prefix(self):
        """Strings sharing the ``application/`` prefix count as close."""
        assert self.env._values_close("application/json", "application/xml")

    def test_check_fix_returns_partial(self):
        """Right key, close value should return 'partial'."""
        result = self.env._check_fix(self._timeout_issue(), {"timeout": 7})
        assert result == "partial"

    def test_check_fix_returns_exact(self):
        """Right key, exact value should return 'exact'."""
        result = self.env._check_fix(self._timeout_issue(), {"timeout": 10})
        assert result == "exact"

    def test_check_fix_returns_none(self):
        """A payload that does not contain the fix key should return 'none'."""
        result = self.env._check_fix(
            self._timeout_issue(), {"base_url": "http://example.com"}
        )
        assert result == "none"


# ─── Integration Tests ────────────────────────────────────────────────────────


class TestFullEpisode:
    """End-to-end episodes driven through the public reset/step/grade calls."""

    @staticmethod
    def _do(env, action_type, target, **extra):
        """Step ``env`` with one action.

        ``extra`` is forwarded to ``ApiDebugAction`` unchanged, so
        ``fix_payload`` is only supplied by callers that submit a fix.
        """
        return env.step(ApiDebugAction(action_type=action_type, target=target, **extra))

    def test_easy_full_solve(self):
        """Inspect, then fix both easy issues; the episode ends with a score above 0.6."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()

        after_logs = self._do(env, "inspect_logs", "payment_client")
        assert after_logs.issues_found >= 1

        after_config = self._do(env, "inspect_config", "payment_client")
        assert "headers" in after_config.config_snapshot

        after_auth = self._do(
            env, "submit_fix", "payment_client",
            fix_payload={"headers.Authorization": "Bearer my_token_123"},
        )
        assert after_auth.issues_fixed >= 1

        after_content_type = self._do(
            env, "submit_fix", "payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        )
        assert after_content_type.issues_fixed == 2
        assert after_content_type.done is True

        assert env.grade() > 0.6

    def test_medium_full_solve(self):
        """Read every service's logs, apply three fixes, and finish the medium task."""
        env = ApiDebugEnvironment(task_id="medium")
        start = env.reset()
        assert start.issues_total == 3

        # Read the logs of every target listed at reset time.
        for svc in start.available_targets:
            self._do(env, "inspect_logs", svc)

        # Look at the sender's configuration.
        self._do(env, "inspect_config", "webhook_sender")

        # Fix 1: rate limit.
        after_rate_limit = self._do(
            env, "submit_fix", "webhook_sender",
            fix_payload={"rate_limit.requests_per_second": 10},
        )
        assert after_rate_limit.issues_fixed >= 1

        # Fix 2: retry policy.
        self._do(
            env, "submit_fix", "webhook_sender",
            fix_payload={"retry": {"max_retries": 3, "backoff_factor": 2, "retry_on_status": [429, 500]}},
        )

        # Fix 3: webhook signature header.
        last = self._do(
            env, "submit_fix", "webhook_sender",
            fix_payload={"headers.X-Webhook-Signature": "sha256=computed_hmac"},
        )

        assert last.done is True
        assert env.grade() > 0.4

    def test_hard_partial_solve(self):
        """Fixing two of five hard issues gives a score that is neither zero nor full."""
        env = ApiDebugEnvironment(task_id="hard")
        start = env.reset()
        assert start.issues_total == 5

        # Only two of the issues get fixed in this episode.
        self._do(env, "inspect_logs", "order_service")
        self._do(
            env, "submit_fix", "order_service",
            fix_payload={"inventory_url": "https://inventory.internal/v2/reserve"},
        )
        self._do(
            env, "submit_fix", "order_service",
            fix_payload={"timeout": 10},
        )

        score = env.grade()
        assert 0.0 < score < 0.999
        # Reads the environment's private record of fixed issues.
        assert len(env._issues_fixed) == 2
        assert len(env._issues_fixed) == 2


class TestCascadingFailures:
    """Checks on the dependency and cascade metadata attached to issues."""

    def test_hard_dependency_chain(self):
        """In the hard scenario, ``hard_timeout`` declares a dependency on ``hard_wrong_url``."""
        hard = get_scenario("hard")
        candidates = (issue for issue in hard.issues if issue.issue_id == "hard_timeout")
        # next() without a default: a missing issue surfaces as StopIteration.
        timeout_issue = next(candidates)
        assert "hard_wrong_url" in timeout_issue.depends_on

    def test_cascade_effects_defined(self):
        """Each scenario has at least one issue with a non-empty ``cascade_effects``."""
        for task_id in get_all_task_ids():
            scenario = get_scenario(task_id)
            any_cascade = False
            for issue in scenario.issues:
                if len(issue.cascade_effects) > 0:
                    any_cascade = True
                    break
            assert any_cascade, f"No cascade effects in {task_id}"


# Convenience entry point: executing this file directly runs pytest on this
# file in verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])