#!/usr/bin/env python3
import os
import sys
import re
import json
import yaml
import unittest
from unittest.mock import MagicMock, patch
import io
from contextlib import redirect_stdout

# Set dummy env vars BEFORE importing inference.py to avoid KeyError
os.environ["API_BASE_URL"] = "http://localhost:8000"
os.environ["MODEL_NAME"] = "test-model"
os.environ["HF_TOKEN"] = "dummy-token"
os.environ["MAX_STEPS"] = "2"
os.environ["TASK_ID"] = "anomaly_detection_easy"

# Add current directory to path so we can import our modules
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))

import inference
from server.fin_auditor_environment import FinAuditorEnvironment
from models import AuditorAction
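
# A minimal reference sketch of the parsing contract exercised in test_1 below.
# NOTE: this is NOT the real inference._parse_llm_decisions; the name, signature,
# and regex here are assumptions used purely to illustrate the expected behavior
# (extract "decisions" from possibly markdown-wrapped LLM output, fall back to
# flag-all on malformed JSON, and pad/truncate to the expected length).
def _reference_parse_decisions(text: str, expected_len: int) -> list:
    match = re.search(r"\{.*\}", text, re.DOTALL)
    decisions = None
    if match:
        try:
            decisions = json.loads(match.group(0)).get("decisions")
        except (json.JSONDecodeError, AttributeError):
            decisions = None
    if not isinstance(decisions, list):
        return [1] * expected_len  # fallback: flag everything
    decisions = [int(d) for d in decisions][:expected_len]
    decisions += [1] * (expected_len - len(decisions))  # pad short lists with 1s
    return decisions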


class FinalIntegrityCheck(unittest.TestCase):

    def test_1_llm_parser_robustness(self):
        """Test the regex and JSON fallback logic in inference.py"""
        print("\n[TEST 1] LLM Parser Robustness...")

        # Test Case A: Markdown-wrapped JSON
        dirty_json = "Here is the result:\n```json\n{\"decisions\": [0, 1, 1]}\n```\nHope this helps!"
        res = inference._parse_llm_decisions(dirty_json, 3)
        self.assertEqual(res, [0, 1, 1], "Failed to parse markdown-wrapped JSON")

        # Test Case B: Extra text before JSON
        extra_text = "The decisions are as follows: {\"decisions\": [1, 1]}"
        res = inference._parse_llm_decisions(extra_text, 2)
        self.assertEqual(res, [1, 1], "Failed to parse JSON with leading text")

        # Test Case C: Malformed JSON -> should trigger 'Flag All' (1) fallback
        malformed = "{\"decisions\": [0, 1, "  # Missing closing bracket
        res = inference._parse_llm_decisions(malformed, 4)
        self.assertEqual(res, [1, 1, 1, 1], "Failed to trigger fallback on malformed JSON")

        # Test Case D: Decision-list length normalization
        wrong_len = "{\"decisions\": [1]}"
        res = inference._parse_llm_decisions(wrong_len, 3)
        self.assertEqual(len(res), 3, "Failed to normalize decision list length")
        self.assertEqual(res, [1, 1, 1], "Failed to pad short decision list with 1s")

        print("✓ LLM Parser logic is robust.")

    def test_2_spec_matching(self):
        """Verify openenv.yaml matches our deployment and task requirements"""
        print("\n[TEST 2] Spec Matching (openenv.yaml)...")

        with open("openenv.yaml", "r") as f:
            spec = yaml.safe_load(f)

        self.assertEqual(spec.get("app"), "server.app:app", "App entry point mismatch")
        self.assertEqual(spec.get("port"), 7860, "Port mismatch - HF requires 7860")

        tasks = spec.get("tasks", [])
        self.assertGreaterEqual(len(tasks), 3, "Missing required tasks (Easy, Medium, Hard)")

        task_ids = [t["id"] for t in tasks]
        self.assertIn("anomaly_detection_easy", task_ids)
        self.assertIn("anomaly_detection_medium", task_ids)
        self.assertIn("anomaly_detection_hard", task_ids)

        print(f"✓ Spec matches. Found {len(tasks)} tasks.")

    def test_3_reward_boundary(self):
        """Verify environment rewards stay strictly within (0.0, 1.0)"""
        print("\n[TEST 3] Reward Boundary Check...")

        env = FinAuditorEnvironment()
        obs = env.reset()
        # Reset should return features now (not empty)
        self.assertGreater(len(obs.features), 0, "Reset should return features for step 1")

        # Simulate a step with some decisions
        action = AuditorAction(decisions=[1] * len(obs.features))
        new_obs = env.step(action)
        reward = new_obs.reward

        self.assertIsNotNone(reward)
        self.assertGreater(reward, 0.0, f"Reward {reward} must be > 0.0 (not exact boundary)")
        self.assertLess(reward, 1.0, f"Reward {reward} must be < 1.0 (not exact boundary)")
        print(f"✓ Reward boundary is safe: {reward}")

    def test_4_reward_varies_by_action(self):
        """Verify rewards differ between flag-all and pass-all decision policies"""
        print("\n[TEST 4] Reward Variation Check...")

        # Run with all-1 decisions (flag everything)
        env1 = FinAuditorEnvironment()
        obs1 = env1.reset()
        action1 = AuditorAction(decisions=[1] * len(obs1.features))
        result1 = env1.step(action1)
        reward1 = result1.reward

        # Run with all-0 decisions (pass everything)
        env2 = FinAuditorEnvironment()
        obs2 = env2.reset()
        action2 = AuditorAction(decisions=[0] * len(obs2.features))
        result2 = env2.step(action2)
        reward2 = result2.reward

        print(f"  All-flag reward: {reward1:.4f}")
        print(f"  All-pass reward: {reward2:.4f}")

        # In EASY mode (100% anomalies), flagging everything should score higher
        self.assertNotEqual(reward1, reward2, "Rewards must differ between flag-all and pass-all")
        print("✓ Rewards vary based on agent decisions.")

    def test_5_stdout_format(self):
        """Run a 2-step inference and verify stdout matches the hackathon regex"""
        print("\n[TEST 5] Stdout Format Compliance...")

        # Mock the OpenAI client response
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = json.dumps({"reasoning": "test", "decisions": [1] * 200})

        with patch("openai.resources.chat.completions.Completions.create", return_value=mock_response):
            f = io.StringIO()
            with redirect_stdout(f):
                inference.main()
            output = f.getvalue()

        lines = [l for l in output.strip().split("\n") if l.strip()]

        # Verify START tag format
        start_line = lines[0]
        start_match = re.match(r'^\[START\] task=\S+ env=\S+ model=\S+$', start_line)
        self.assertIsNotNone(start_match, f"START line doesn't match regex: {start_line}")
        self.assertTrue(start_line.startswith("[START]"), "The first non-empty line MUST be the [START] tag.")

        # Verify STEP tag format
        step_lines = [l for l in lines if l.startswith("[STEP]")]
        self.assertTrue(len(step_lines) >= 1, "No STEP lines found")
        for sl in step_lines:
            step_match = re.match(
                r'^\[STEP\] step=\d+ action=.*? reward=-?\d+\.\d{2} done=(true|false) error=.*$',
                sl
            )
            self.assertIsNotNone(step_match, f"STEP line doesn't match regex: {sl}")

        # Verify END tag format
        end_line = lines[-1]
        end_match = re.match(
            r'^\[END\] success=(true|false) steps=\d+ score=-?\d+\.\d+ rewards=(?:-?\d+\.\d{2}(?:,-?\d+\.\d{2})*)?$',
            end_line
        )
        self.assertIsNotNone(end_match, f"END line doesn't match regex: {end_line}")

        # Verify NO JSON on stdout
        self.assertNotIn("{", output, "Stdout must not contain JSON braces")
        self.assertNotIn("}", output, "Stdout must not contain JSON braces")

        # Verify STRICT line-type matching (absolutely NO unauthorized prints)
        for l in lines:
            self.assertTrue(
                l.startswith("[START]") or l.startswith("[STEP]") or l.startswith("[END]"),
                f"Unauthorized line detected in STDOUT: {l}"
            )

        print("✓ Stdout format is compliant with hackathon regex rules.")


if __name__ == "__main__":
    unittest.main(verbosity=1)