File size: 7,622 Bytes
f0023cf
 
 
85228ff
f0023cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85228ff
f0023cf
85228ff
f0023cf
 
85228ff
f0023cf
85228ff
f0023cf
85228ff
f0023cf
 
85228ff
f0023cf
 
 
 
 
85228ff
f0023cf
 
 
 
 
 
 
 
 
 
af6fa71
f0023cf
 
 
 
 
 
 
 
 
 
 
 
85228ff
f0023cf
 
 
 
85228ff
 
 
f0023cf
85228ff
f0023cf
85228ff
f0023cf
 
85228ff
 
f0023cf
 
 
85228ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0023cf
 
 
 
85228ff
f0023cf
a6c300d
f0023cf
 
a6c300d
f0023cf
 
85228ff
 
 
 
 
 
3902809
85228ff
 
 
 
 
 
af6fa71
85228ff
 
 
f0023cf
85228ff
 
 
af6fa71
85228ff
 
 
f0023cf
85228ff
 
 
f0023cf
a4d39b3
 
 
 
 
 
 
85228ff
f0023cf
 
bbf6d3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python3
import os
import sys
import re
import json
import yaml
import unittest
from unittest.mock import MagicMock, patch
import io
from contextlib import redirect_stdout

# inference.py is imported further down; these placeholder settings have to be
# in the environment before that import happens, otherwise it fails with a
# KeyError.
os.environ.update(
    {
        "API_BASE_URL": "http://localhost:8000",
        "MODEL_NAME": "test-model",
        "HF_TOKEN": "dummy-token",
        "MAX_STEPS": "2",
        "TASK_ID": "anomaly_detection_easy",
    }
)

# Put the directory containing this file at the front of the import path so
# the project modules imported below can be found.
sys.path.insert(
    0,
    os.path.abspath(os.path.dirname(__file__)),
)

import inference
from server.fin_auditor_environment import FinAuditorEnvironment
from models import AuditorAction

class FinalIntegrityCheck(unittest.TestCase):
    """Pre-submission checks for the parser, spec file, environment and stdout.

    The tests cover, in order: the LLM reply parser in ``inference``, the
    contents of ``openenv.yaml``, the reward range and reward variation of
    ``FinAuditorEnvironment``, and the line format that ``inference.main()``
    writes to stdout.
    """

    def test_1_llm_parser_robustness(self):
        """Test the regex and JSON fallback logic in inference.py"""
        print("\n[TEST 1] LLM Parser Robustness...")

        # Test Case A: Markdown wrapped JSON
        dirty_json = "Here is the result:\n```json\n{\"decisions\": [0, 1, 1]}\n```\nHope this helps!"
        res = inference._parse_llm_decisions(dirty_json, 3)
        self.assertEqual(res, [0, 1, 1], "Failed to parse markdown-wrapped JSON")

        # Test Case B: Extra text before JSON
        extra_text = "The decisions are as follows: {\"decisions\": [1, 1]}"
        res = inference._parse_llm_decisions(extra_text, 2)
        self.assertEqual(res, [1, 1], "Failed to parse JSON with leading text")

        # Test Case C: Malformed JSON -> should trigger 'Flag All' (1) fallback
        malformed = "{\"decisions\": [0, 1, "  # Missing closing bracket
        res = inference._parse_llm_decisions(malformed, 4)
        self.assertEqual(res, [1, 1, 1, 1], "Failed to trigger fallback on malformed JSON")

        # Test Case D: Correct length normalization
        wrong_len = "{\"decisions\": [1]}"
        res = inference._parse_llm_decisions(wrong_len, 3)
        self.assertEqual(len(res), 3, "Failed to normalize decision list length")
        self.assertEqual(res, [1, 1, 1], "Failed to pad short decision list with 1s")

        print("✓ LLM Parser logic is robust.")

    def test_2_spec_matching(self):
        """Verify openenv.yaml matches our deployment and task requirements"""
        print("\n[TEST 2] Spec Matching (openenv.yaml)...")

        # Prefer the spec that sits next to this file so the test does not
        # depend on the working directory it is launched from; keep the
        # original CWD-relative lookup as a fallback.
        spec_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "openenv.yaml")
        if not os.path.exists(spec_path):
            spec_path = "openenv.yaml"
        with open(spec_path, "r", encoding="utf-8") as f:
            spec = yaml.safe_load(f)

        # An empty YAML document loads as None; fail with a readable message
        # instead of an AttributeError on the .get() calls below.
        self.assertIsInstance(spec, dict, "openenv.yaml must contain a top-level mapping")

        self.assertEqual(spec.get("app"), "server.app:app", "App entry point mismatch")
        self.assertEqual(spec.get("port"), 7860, "Port mismatch - HF requires 7860")

        # `or []` also covers a `tasks:` key that is present but empty.
        tasks = spec.get("tasks") or []
        self.assertGreaterEqual(len(tasks), 3, "Missing required tasks (Easy, Medium, Hard)")

        # Entries without an "id" simply do not contribute an id, so a
        # malformed task shows up as a failed assertIn rather than a KeyError.
        task_ids = [t.get("id") for t in tasks if isinstance(t, dict)]
        for required_id in (
            "anomaly_detection_easy",
            "anomaly_detection_medium",
            "anomaly_detection_hard",
        ):
            self.assertIn(required_id, task_ids)

        print(f"✓ Spec matches. Found {len(tasks)} tasks.")

    def test_3_reward_boundary(self):
        """Verify environment rewards stay strictly within (0.0, 1.0)"""
        print("\n[TEST 3] Reward Boundary Check...")
        env = FinAuditorEnvironment()
        obs = env.reset()

        # Reset should return features now (not empty)
        self.assertGreater(len(obs.features), 0, "Reset should return features for step 1")

        # Simulate a step that flags every returned feature row
        action = AuditorAction(decisions=[1] * len(obs.features))
        new_obs = env.step(action)

        reward = new_obs.reward
        self.assertIsNotNone(reward)
        self.assertGreater(reward, 0.0, f"Reward {reward} must be > 0.0 (not exact boundary)")
        self.assertLess(reward, 1.0, f"Reward {reward} must be < 1.0 (not exact boundary)")

        print(f"✓ Reward boundary is safe: {reward}")

    def test_4_reward_varies_by_action(self):
        """Verify the first-step reward differs between flag-all and pass-all."""
        print("\n[TEST 4] Reward Variation Check...")

        # Run with all-1 decisions (flag everything)
        env1 = FinAuditorEnvironment()
        obs1 = env1.reset()
        action1 = AuditorAction(decisions=[1] * len(obs1.features))
        result1 = env1.step(action1)
        reward1 = result1.reward

        # Run with all-0 decisions (pass everything)
        env2 = FinAuditorEnvironment()
        obs2 = env2.reset()
        action2 = AuditorAction(decisions=[0] * len(obs2.features))
        result2 = env2.step(action2)
        reward2 = result2.reward

        # Guard before the ":.4f" formatting below, which would raise a
        # TypeError (not a test failure) on a missing reward.
        self.assertIsNotNone(reward1, "Flag-all step returned no reward")
        self.assertIsNotNone(reward2, "Pass-all step returned no reward")

        print(f"  All-flag reward: {reward1:.4f}")
        print(f"  All-pass reward: {reward2:.4f}")

        # NOTE(review): the original comment expected flag-all to score
        # *higher* in EASY mode; only inequality is asserted here - confirm
        # whether a stricter assertGreater is intended.
        self.assertNotEqual(reward1, reward2, "Rewards must differ between flag-all and pass-all")

        print("✓ Rewards vary based on agent decisions.")

    def test_5_stdout_format(self):
        """Run inference.main() with a mocked LLM and verify the stdout format.

        The chat-completions call is patched to return a fixed JSON reply, and
        everything ``inference.main()`` prints is captured and checked against
        the [START] / [STEP] / [END] line patterns.
        """
        print("\n[TEST 5] Stdout Format Compliance...")

        # Mock the OpenAI client response
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = json.dumps({"reasoning": "test", "decisions": [1] * 200})

        captured = io.StringIO()
        with patch("openai.resources.chat.completions.Completions.create", return_value=mock_response):
            with redirect_stdout(captured):
                inference.main()

        output = captured.getvalue()
        lines = [line for line in output.strip().split("\n") if line.strip()]

        # Fail clearly on empty output instead of an IndexError on lines[0].
        self.assertTrue(lines, "inference.main() produced no output on stdout")

        # Verify START tag format; checking lines[0] also enforces that the
        # first non-empty line is the [START] tag.
        start_line = lines[0]
        self.assertRegex(
            start_line,
            r'^\[START\] task=\S+ env=\S+ model=\S+$',
            f"The first non-empty line MUST be the [START] tag. START line doesn't match regex: {start_line}",
        )

        # Verify STEP tag format
        step_lines = [line for line in lines if line.startswith("[STEP]")]
        self.assertGreaterEqual(len(step_lines), 1, "No STEP lines found")
        for step_line in step_lines:
            self.assertRegex(
                step_line,
                r'^\[STEP\] step=\d+ action=.*? reward=-?\d+\.\d{2} done=(true|false) error=.*$',
                f"STEP line doesn't match regex: {step_line}",
            )

        # Verify END tag format
        end_line = lines[-1]
        self.assertRegex(
            end_line,
            r'^\[END\] success=(true|false) steps=\d+ score=-?\d+\.\d+ rewards=(?:-?\d+\.\d{2}(?:,-?\d+\.\d{2})*)?$',
            f"END line doesn't match regex: {end_line}",
        )

        # Verify NO JSON on stdout
        self.assertNotIn("{", output, "Stdout must not contain JSON braces")
        self.assertNotIn("}", output, "Stdout must not contain JSON braces")

        # Verify STRICT line-type matching (Absolutely NO unauthorized prints)
        allowed_prefixes = ("[START]", "[STEP]", "[END]")
        for line in lines:
            self.assertTrue(
                line.startswith(allowed_prefixes),
                f"Unauthorized line detected in STDOUT: {line}",
            )

        print("✓ Stdout format is compliant with hackathon regex rules.")

# Script entry point: when this file is executed directly (rather than
# imported), hand control to unittest's command-line runner, which runs the
# TestCase classes defined in this module.
if __name__ == "__main__":
    unittest.main(verbosity=1)