#!/usr/bin/env python3
import os
import sys
import re
import json
import yaml
import unittest
from unittest.mock import MagicMock, patch
import io
from contextlib import redirect_stdout

# Set dummy env vars BEFORE importing inference.py to avoid KeyError
os.environ["API_BASE_URL"] = "http://localhost:8000"
os.environ["MODEL_NAME"] = "test-model"
os.environ["HF_TOKEN"] = "dummy-token"
os.environ["MAX_STEPS"] = "2"
os.environ["TASK_ID"] = "anomaly_detection_easy"

# Add current directory to path so we can import our modules
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

import inference
from server.fin_auditor_environment import FinAuditorEnvironment
from models import AuditorAction
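
# A minimal reference sketch of the parsing contract exercised in test_1 below.
# NOTE: this is NOT the real inference._parse_llm_decisions; the name, signature,
# and regex here are assumptions used purely to illustrate the expected behavior
# (extract "decisions" from possibly markdown-wrapped LLM output, fall back to
# flag-all on malformed JSON, and pad/truncate to the expected length).
def _reference_parse_decisions(text: str, expected_len: int) -> list:
    match = re.search(r"\{.*\}", text, re.DOTALL)
    decisions = None
    if match:
        try:
            decisions = json.loads(match.group(0)).get("decisions")
        except (json.JSONDecodeError, AttributeError):
            decisions = None
    if not isinstance(decisions, list):
        return [1] * expected_len  # fallback: flag everything
    decisions = [int(d) for d in decisions][:expected_len]
    decisions += [1] * (expected_len - len(decisions))  # pad short lists with 1s
    return decisions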


class FinalIntegrityCheck(unittest.TestCase):

    def test_1_llm_parser_robustness(self):
        """Test the regex and JSON fallback logic in inference.py"""
        print("\n[TEST 1] LLM Parser Robustness...")

        # Test Case A: Markdown-wrapped JSON
        dirty_json = "Here is the result:\n```json\n{\"decisions\": [0, 1, 1]}\n```\nHope this helps!"
        res = inference._parse_llm_decisions(dirty_json, 3)
        self.assertEqual(res, [0, 1, 1], "Failed to parse markdown-wrapped JSON")

        # Test Case B: Extra text before JSON
        extra_text = "The decisions are as follows: {\"decisions\": [1, 1]}"
        res = inference._parse_llm_decisions(extra_text, 2)
        self.assertEqual(res, [1, 1], "Failed to parse JSON with leading text")

        # Test Case C: Malformed JSON -> should trigger 'Flag All' (1) fallback
        malformed = "{\"decisions\": [0, 1, "  # Missing closing bracket
        res = inference._parse_llm_decisions(malformed, 4)
        self.assertEqual(res, [1, 1, 1, 1], "Failed to trigger fallback on malformed JSON")

        # Test Case D: Decision-list length normalization
        wrong_len = "{\"decisions\": [1]}"
        res = inference._parse_llm_decisions(wrong_len, 3)
        self.assertEqual(len(res), 3, "Failed to normalize decision list length")
        self.assertEqual(res, [1, 1, 1], "Failed to pad short decision list with 1s")

        print("✓ LLM Parser logic is robust.")

    def test_2_spec_matching(self):
        """Verify openenv.yaml matches our deployment and task requirements"""
        print("\n[TEST 2] Spec Matching (openenv.yaml)...")

        with open("openenv.yaml", "r") as f:
            spec = yaml.safe_load(f)

        self.assertEqual(spec.get("app"), "server.app:app", "App entry point mismatch")
        self.assertEqual(spec.get("port"), 7860, "Port mismatch - HF requires 7860")

        tasks = spec.get("tasks", [])
        self.assertGreaterEqual(len(tasks), 3, "Missing required tasks (Easy, Medium, Hard)")

        task_ids = [t["id"] for t in tasks]
        self.assertIn("anomaly_detection_easy", task_ids)
        self.assertIn("anomaly_detection_medium", task_ids)
        self.assertIn("anomaly_detection_hard", task_ids)

        print(f"✓ Spec matches. Found {len(tasks)} tasks.")

    def test_3_reward_boundary(self):
        """Verify environment rewards stay strictly within (0.0, 1.0)"""
        print("\n[TEST 3] Reward Boundary Check...")

        env = FinAuditorEnvironment()
        obs = env.reset()
        # Reset should return features now (not empty)
        self.assertGreater(len(obs.features), 0, "Reset should return features for step 1")

        # Simulate a step with some decisions
        action = AuditorAction(decisions=[1] * len(obs.features))
        new_obs = env.step(action)
        reward = new_obs.reward

        self.assertIsNotNone(reward)
        self.assertGreater(reward, 0.0, f"Reward {reward} must be > 0.0 (not exact boundary)")
        self.assertLess(reward, 1.0, f"Reward {reward} must be < 1.0 (not exact boundary)")
        print(f"✓ Reward boundary is safe: {reward}")

    def test_4_reward_varies_by_action(self):
        """Verify rewards differ between flag-all and pass-all decision policies"""
        print("\n[TEST 4] Reward Variation Check...")

        # Run with all-1 decisions (flag everything)
        env1 = FinAuditorEnvironment()
        obs1 = env1.reset()
        action1 = AuditorAction(decisions=[1] * len(obs1.features))
        result1 = env1.step(action1)
        reward1 = result1.reward

        # Run with all-0 decisions (pass everything)
        env2 = FinAuditorEnvironment()
        obs2 = env2.reset()
        action2 = AuditorAction(decisions=[0] * len(obs2.features))
        result2 = env2.step(action2)
        reward2 = result2.reward

        print(f"  All-flag reward: {reward1:.4f}")
        print(f"  All-pass reward: {reward2:.4f}")

        # In EASY mode (100% anomalies), flagging everything should score higher
        self.assertNotEqual(reward1, reward2, "Rewards must differ between flag-all and pass-all")
        print("✓ Rewards vary based on agent decisions.")

    def test_5_stdout_format(self):
        """Run a 2-step inference and verify stdout matches the hackathon regex"""
        print("\n[TEST 5] Stdout Format Compliance...")

        # Mock the OpenAI client response
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = json.dumps({"reasoning": "test", "decisions": [1] * 200})

        with patch("openai.resources.chat.completions.Completions.create", return_value=mock_response):
            f = io.StringIO()
            with redirect_stdout(f):
                inference.main()
            output = f.getvalue()

        lines = [l for l in output.strip().split("\n") if l.strip()]

        # Verify START tag format
        start_line = lines[0]
        start_match = re.match(r'^\[START\] task=\S+ env=\S+ model=\S+$', start_line)
        self.assertIsNotNone(start_match, f"START line doesn't match regex: {start_line}")
        self.assertTrue(start_line.startswith("[START]"), "The first non-empty line MUST be the [START] tag.")

        # Verify STEP tag format
        step_lines = [l for l in lines if l.startswith("[STEP]")]
        self.assertTrue(len(step_lines) >= 1, "No STEP lines found")
        for sl in step_lines:
            step_match = re.match(
                r'^\[STEP\] step=\d+ action=.*? reward=-?\d+\.\d{2} done=(true|false) error=.*$',
                sl
            )
            self.assertIsNotNone(step_match, f"STEP line doesn't match regex: {sl}")

        # Verify END tag format
        end_line = lines[-1]
        end_match = re.match(
            r'^\[END\] success=(true|false) steps=\d+ score=-?\d+\.\d+ rewards=(?:-?\d+\.\d{2}(?:,-?\d+\.\d{2})*)?$',
            end_line
        )
        self.assertIsNotNone(end_match, f"END line doesn't match regex: {end_line}")

        # Verify NO JSON on stdout
        self.assertNotIn("{", output, "Stdout must not contain JSON braces")
        self.assertNotIn("}", output, "Stdout must not contain JSON braces")

        # Verify STRICT line-type matching (absolutely NO unauthorized prints)
        for l in lines:
            self.assertTrue(
                l.startswith("[START]") or l.startswith("[STEP]") or l.startswith("[END]"),
                f"Unauthorized line detected in STDOUT: {l}"
            )

        print("✓ Stdout format is compliant with hackathon regex rules.")


if __name__ == "__main__":
    unittest.main(verbosity=1)