File size: 11,299 Bytes
56acd24
a310a12
56acd24
a310a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56acd24
 
f0023cf
a310a12
f0023cf
 
 
56acd24
a310a12
f0023cf
a310a12
f0023cf
a310a12
3902809
f0023cf
 
 
 
 
 
 
 
 
a310a12
f0023cf
a310a12
 
f0023cf
a310a12
3385186
 
 
 
 
 
 
a310a12
56acd24
54a2655
3385186
a310a12
 
54a2655
a310a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0023cf
56acd24
 
f0023cf
a310a12
56acd24
 
 
af6fa71
f0023cf
a310a12
56acd24
 
af6fa71
f554b94
a310a12
 
f0023cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a310a12
 
 
 
 
 
 
 
 
 
 
 
 
 
f0023cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56acd24
f0023cf
 
a310a12
f0023cf
 
a310a12
f0023cf
56acd24
f0023cf
 
 
a310a12
f0023cf
56acd24
a310a12
56acd24
f0023cf
56acd24
f0023cf
56acd24
e6c33aa
f0023cf
a310a12
f0023cf
 
 
a310a12
6c4cb39
060c9f5
f0023cf
 
 
 
 
a310a12
56acd24
a310a12
 
 
56acd24
 
 
a310a12
56acd24
c591055
a310a12
f0023cf
 
a310a12
6c4cb39
 
 
3902809
 
6c4cb39
 
 
3902809
 
6c4cb39
 
 
3902809
 
 
f0023cf
 
56acd24
f0023cf
 
 
 
c591055
f554b94
f0023cf
56acd24
f0023cf
 
 
a310a12
e6c33aa
 
56acd24
a310a12
f0023cf
56acd24
 
a310a12
 
56acd24
f0023cf
56acd24
f0023cf
a310a12
 
 
 
 
 
 
 
 
 
 
 
 
f0023cf
 
a310a12
c591055
56acd24
a310a12
 
 
 
 
 
7435d89
56acd24
f0023cf
a310a12
f0023cf
a310a12
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
"""
Inference Script Example
===================================
MANDATORY
- Before submitting, ensure the following variables are defined in your environment configuration:
    API_BASE_URL   The API endpoint for the LLM.
    MODEL_NAME     The model identifier to use for inference.
    HF_TOKEN       Your Hugging Face / API key.
    LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
                     method

- Defaults are set only for API_BASE_URL and MODEL_NAME 
    (and should reflect your active inference setup):
    API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
    MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
    
- The inference script must be named `inference.py` and placed in the root directory of the project
- Participants must use OpenAI Client for all LLM calls using above variables

STDOUT FORMAT
- The script must emit exactly three line types to stdout, in this order:

    [START] task=<task_name> env=<benchmark> model=<model_name>
    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END]   success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>

  Rules:
    - One [START] line at episode begin.
    - One [STEP] line per step, immediately after env.step() returns.
    - One [END] line after env.close(), always emitted (even on exception).
    - reward and rewards are formatted to 2 decimal places.
    - done and success are lowercase booleans: true or false.
    - error is the raw last_action_error string, or null if none.
    - All fields on a single line with no newlines within a line.
    - Each tasks should return score in [0, 1]

  Example:
    [START] task=click-test env=miniwob model=Qwen3-VL-30B
    [STEP] step=1 action=click('123') reward=0.00 done=false error=null
    [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
    [STEP] step=3 action=click('789') reward=1.00 done=true error=null
    [END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00
"""

import os
import textwrap
import json
import re
import time
from typing import List, Optional
from pydantic import BaseModel

from openai import OpenAI

import sys
import contextlib
_ROOT = os.path.dirname(os.path.abspath(__file__))
if _ROOT not in sys.path:
    sys.path.insert(0, _ROOT)

try:
    from hft_auditor_env import FinAuditorEnv as FinAuditorEnvironment
except ImportError:
    from server.fin_auditor_environment import FinAuditorEnvironment
from models import AuditorAction
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "meta-llama/Meta-Llama-3-8B-Instruct"

TASK_ID = os.getenv("TASK_ID", "anomaly_detection_hard")
if "easy" in TASK_ID.lower():
    _DEFAULT_MAX = 5
elif "medium" in TASK_ID.lower():
    _DEFAULT_MAX = 10
else:
    _DEFAULT_MAX = 20

MAX_STEPS = int(os.getenv("MAX_STEPS", str(_DEFAULT_MAX)))
TEMPERATURE = 0.0
MAX_TOKENS = 3000

SYSTEM_PROMPT = """\
You are a Wall Street Compliance Auditor AI embedded inside a High-Frequency Trading audit engine.
You MUST think step-by-step in the 'reasoning' field before determining your action, but KEEP IT BRIEF (max 2 sentences total).

━━━ DECISION VALUES ━━━
You must output ONLY raw integers (0 or 1) in the array. NO strings. NO labels.
  1 : Confirmed anomaly.
  0 : Trade is clean.

━━━ EVALUATION DIRECTIVE ━━━
1. CRITICAL SIGNAL (FLAG): If risk_score > 0.60, output 1.
2. NOISE SIGNAL (PASS): If risk_score < 0.30, output 0.
3. AMBIGUITY (FLAG): If risk_score is in between, output 1 to be safe.

━━━ CRITICAL JSON FORMAT ━━━
You MUST respond with a valid JSON object. The decisions array MUST contain exactly the requested number of raw integers. NO trailing commas.
Example:
{"reasoning": "Trade 1 has high risk. Trade 2 is safe.", "decisions": [1, 0, 1]}
"""


def log_start(task: str, env: str, model: str) -> None:
    print(f"[START] task={task} env={env} model={model}", flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}", flush=True)


def build_user_prompt(step: int, features: list[list[float]]) -> str:
    lines = [
        f"Step {step}: You have {len(features)} flagged trades to audit.",
        "",
        "Trade# | time_elapsed | price_delta | missing_freq | risk_score",
        "-------|--------------|-------------|--------------|----------",
    ]
    for i, row in enumerate(features):
        if len(row) >= 4:
            lines.append(f"  {i+1:3d}  |   {row[0]:8.4f}   |   {row[1]:7.4f}   |   {row[2]:8.4f}   |  {row[3]:7.4f}")
        else:
            lines.append(f"  {i+1:3d}  |   (malformed row: {row})")
    lines.append("")
    lines.append(f"Provide exactly {len(features)} decisions as a JSON object.")
    return "\n".join(lines)


class LLMResponse(BaseModel):
    reasoning: str
    decisions: List[int]

_last_reasoning: str = ""

def _normalize_decisions(decisions: list[int], expected: int) -> list[int]:
    clamped = [1 if d >= 1 else 0 for d in decisions]
    clamped = clamped[:expected]
    while len(clamped) < expected:
        clamped.append(1) 
    return clamped

def _parse_llm_decisions(content: str, expected_count: int) -> list[int]:
    global _last_reasoning
    stripped = content.strip()
    if stripped.startswith("```"):
        stripped = re.sub(r'^```[\w]*\n?', '', stripped)
        stripped = re.sub(r'\n?```$', '', stripped.strip())

    try:
        parsed = json.loads(stripped)
        if isinstance(parsed, dict) and "decisions" in parsed:
            response = LLMResponse(**parsed)
            _last_reasoning = response.reasoning
            return _normalize_decisions([int(d) for d in response.decisions], expected_count)
    except Exception:
        pass

    try:
        parsed = json.loads(stripped)
        if isinstance(parsed, dict) and "decisions" in parsed:
            decisions = [int(d) for d in parsed["decisions"]]
            return _normalize_decisions(decisions, expected_count)
    except Exception:
        pass

    match = re.search(r'\[[\s\d,]+\]', content)
    if match:
        try:
            decisions = json.loads(match.group())
            return _normalize_decisions([int(d) for d in decisions], expected_count)
        except Exception:
            pass

    return [1] * expected_count

def get_model_message(client: OpenAI, step: int, features: list[list[float]]) -> list[int]:
    global _last_reasoning
    _last_reasoning = "Fallback triggered."
    user_prompt = build_user_prompt(step, features)
    max_retries = 3

    for _ in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS,
                stream=False,
            )
            content = (completion.choices[0].message.content or "").strip()
            return _parse_llm_decisions(content, len(features))
        except Exception as exc:
            print(f"[DEBUG] Model request failed: {exc}", file=sys.stderr, flush=True)
            time.sleep(1)
            
    fallback_decisions = []
    for row in features:
        if len(row) >= 4:
            # Matches SYSTEM_PROMPT: 1 if > 0.60, 0 if < 0.30, 1 if in between.
            risk_score = row[3]
            fallback_decisions.append(0 if risk_score < 0.30 else 1)
        else:
            fallback_decisions.append(1)
            
    return fallback_decisions


def main() -> None:
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    env = FinAuditorEnvironment()

    rewards: List[float] = []
    steps_taken = 0
    score = 0.10
    success = False

    log_start(task=TASK_ID, env="fin_auditor", model=MODEL_NAME)

    try:
        # Determine the correct task configuration dynamically based on TASK_ID
        if "easy" in TASK_ID.lower():
            from tasks.task1_easy import setup_env
            setup_env(env)
            from graders.grader_detection import EasyDetectionGrader
            grader = EasyDetectionGrader()
        elif "medium" in TASK_ID.lower():
            from tasks.task2_medium import setup_env
            setup_env(env)
            from graders.grader_classification import MediumClassificationGrader
            grader = MediumClassificationGrader()
        else:
            from tasks.task3_hard import setup_env
            setup_env(env)
            from graders.grader_fix import HardFixGrader
            grader = HardFixGrader()
            
        obs = env.reset()

        for step in range(1, MAX_STEPS + 1):
            features = obs.features

            if not features:
                action = AuditorAction(decisions=[])
                global _last_reasoning
                _last_reasoning = "Empty matrix."
            else:
                decisions = get_model_message(client, step, features)
                action = AuditorAction(decisions=decisions)

            obs = env.step(action)

            base_reward = float(obs.reward) if obs.reward is not None else 0.1
            reward = float(max(0.01, min(0.99, base_reward)))
            done = obs.done
            error = None

            rewards.append(reward)
            steps_taken = step

            action_str = ",".join(str(d) for d in action.decisions) if action.decisions else "none"
            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        if "easy" in TASK_ID.lower():
            from graders.grader_detection import EasyDetectionGrader
            grader = EasyDetectionGrader()
        elif "medium" in TASK_ID.lower():
            from graders.grader_classification import MediumClassificationGrader
            grader = MediumClassificationGrader()
        else:
            from graders.grader_fix import HardFixGrader
            grader = HardFixGrader()
            
        score = grader.grade(env.state)
        success = True

    except Exception as exc:
        print(f"[DEBUG] Inference failed: {exc}", file=sys.stderr, flush=True)
    finally:
        if not rewards:
            rewards = [0.10]
            score = 0.10
            
        # Ensure absolutely no element is exactly 0.0 or 1.0 or outside the valid range.
        for i in range(len(rewards)):
            rewards[i] = float(max(0.01, min(0.99, rewards[i])))
        
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


if __name__ == "__main__":
    main()