File size: 4,580 Bytes
03d9e7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
State / time / arb-awareness features for ARB-MAX. 15 features, fixed order.
"""

from __future__ import annotations

import math
from typing import List

import numpy as np
import pandas as pd

# Names of the 15 features, listed in the order they appear in the vector
# returned by ``extract``.
FEATURE_NAMES: List[str] = [
    # --- position in the window ---
    # at_tick / 900
    "tick_norm",
    # 900 - at_tick
    "seconds_remaining",
    # log(900 - at_tick + 1)
    "log_seconds_remaining",
    # --- best combined cost seen so far ---
    # min over ticks 0..at_tick of up_ask1 + dn_ask1
    "min_combined_cost_so_far",
    # min_combined_cost - 1.0
    "current_best_arb_opportunity",
    # --- open-leg placeholders (always 0 at feature-extraction time) ---
    "leg_side_stub",
    "leg_cost_stub",
    "leg_age_stub",
    # --- hedge price needed to break even ---
    # 1.0 - up_ask_now - per-share fee(up_ask_now)
    "required_hedge_px_for_be_up",
    # 1.0 - dn_ask_now - per-share fee(dn_ask_now)
    "required_hedge_px_for_be_dn",
    # --- arb history ---
    # at_tick minus the tick index of the minimum combined cost
    "time_since_best_combined_cost_seen",
    # 1 if the minimum combined cost so far is below 1, else 0
    "has_potential_arb_been_available_yet",
    # up_ask_now + dn_ask_now
    "current_combined_cost",
    # current combined cost - minimum combined cost so far
    "combined_cost_minus_min",
    # fraction of ticks so far whose combined cost is <= the current one
    "arb_potential_rank_in_window",
]
assert len(FEATURE_NAMES) == 15


def _fee_shares(p: float, shares: int = 100) -> float:
    """Return the fee for ``shares`` shares at price ``p``.

    The fee is ``0.072 * p * (1 - p) * shares``. A price that is not finite,
    or that lies outside the open interval (0, 1), yields ``0.0``.
    """
    price_is_usable = np.isfinite(p) and 0 < p < 1
    if price_is_usable:
        return 0.072 * p * (1.0 - p) * shares
    return 0.0


def _col(df: pd.DataFrame, name: str) -> np.ndarray:
    """Return column ``name`` of ``df`` as a float64 array.

    When ``df`` has no such column, an all-NaN float64 array with one entry
    per row of ``df`` is returned instead.
    """
    if name not in df.columns:
        return np.full(len(df), np.nan, dtype=np.float64)
    return df[name].to_numpy(dtype=np.float64)


def _ff(a: np.ndarray) -> np.ndarray:
    """Fill the non-finite entries of a 1-D series so the result is all finite.

    * A NaN/+inf/-inf entry takes the most recent finite value before it.
    * Entries ahead of the first finite value take that first finite value.
    * If the series holds no finite value at all, every entry becomes 0.0.
    * An empty series yields an empty array (this used to raise IndexError).

    Args:
        a: 1-D numeric array. It is not modified.

    Returns:
        A new float64 array with the same length as ``a``.
    """
    values = np.asarray(a, dtype=np.float64)
    if values.size == 0:
        return values.copy()

    finite = np.isfinite(values)
    if not finite.any():
        return np.zeros_like(values)

    # For each position, the index of the latest finite sample at or before
    # it. Positions ahead of the first finite sample temporarily point at
    # index 0 and are overwritten below.
    latest_finite = np.maximum.accumulate(
        np.where(finite, np.arange(values.size), 0)
    )
    filled = values[latest_finite]  # fancy indexing returns a fresh array

    first = int(np.argmax(finite))  # index of the first finite sample
    filled[:first] = values[first]
    return filled


def extract(window_frame: pd.DataFrame, at_tick: int = 120) -> np.ndarray:
    """Build the 15-element feature vector for ``window_frame`` at ``at_tick``.

    Only rows ``0..at_tick`` (inclusive, positional) are used. Best ask
    prices are read from the ``pm_up_ask_px_1`` and ``pm_dn_ask_px_1``
    columns; a missing column is treated as all-NaN and gaps are filled by
    ``_ff``.

    Args:
        window_frame: Per-tick frame for one window.
        at_tick: Tick at which features are evaluated. The time features use
            this value directly, even when the frame has fewer rows than
            ``at_tick + 1``.

    Returns:
        A float32 array of length 15. Any non-finite value is replaced by 0.0.

    Notes:
        * A negative ``at_tick`` produces an empty window; it is never
          interpreted as "count rows from the end of the frame".
        * An empty window uses 0.0 for the price-derived features, 0.5 for
          the rank, and reports that no arb has been available yet.
    """
    window_ticks = 900  # window length used by the time features

    # Clamp the stop bound: a negative stop would make iloc slice relative to
    # the END of the frame and silently select the wrong rows.
    df = window_frame.iloc[: max(at_tick + 1, 0)]

    if len(df) > 0:
        up_ask = _ff(_col(df, "pm_up_ask_px_1"))
        dn_ask = _ff(_col(df, "pm_dn_ask_px_1"))
    else:
        # Nothing to fill; keep the empty case independent of how _ff treats
        # zero-length input.
        up_ask = np.zeros(0, dtype=np.float64)
        dn_ask = np.zeros(0, dtype=np.float64)
    combined = up_ask + dn_ask

    # --- time features ---
    tick_norm = float(at_tick) / float(window_ticks)
    seconds_remaining = float(window_ticks - at_tick)
    # max(..., 1.0) keeps the log argument >= 1 once at_tick passes the window
    log_seconds_remaining = float(math.log(max(seconds_remaining + 1.0, 1.0)))

    # --- price features ---
    if combined.size > 0:
        min_combined = float(np.nanmin(combined))
        argmin_idx = int(np.nanargmin(combined))
        up_ask_now = float(up_ask[-1])
        dn_ask_now = float(dn_ask[-1])
        current_combined = float(combined[-1])
        has_arb_yet = 1.0 if min_combined < 1.0 else 0.0
        # NOTE(review): when an ask column is absent or entirely NaN, _ff
        # yields zeros, so the combined cost drops below 1 and this flag
        # reads 1. Confirm whether missing quotes should count as an arb.
    else:
        min_combined = 0.0
        argmin_idx = 0
        up_ask_now = 0.0
        dn_ask_now = 0.0
        current_combined = 0.0
        # No quote was observed, so no arb can have been observed either.
        has_arb_yet = 0.0

    current_best_arb = min_combined - 1.0

    # Per spec these are stubs: no open leg is carried during feature
    # extraction.
    leg_side_stub = 0.0
    leg_cost_stub = 0.0
    leg_age_stub = 0.0

    # Hedge-to-breakeven: after buying one side at its current ask, the other
    # side must cost at most 1 - ask - fee. The fee is approximated as the
    # entry fee only; _fee_shares prices 100 shares, so divide by 100 to get
    # the per-share figure.
    fee_up = _fee_shares(up_ask_now) / 100.0
    fee_dn = _fee_shares(dn_ask_now) / 100.0
    req_hedge_up = 1.0 - up_ask_now - fee_up
    req_hedge_dn = 1.0 - dn_ask_now - fee_dn

    time_since_best = float(at_tick - argmin_idx)
    combined_minus_min = current_combined - min_combined

    # Fraction of ticks so far that were at least as cheap as the current
    # one; a single observation has no history to rank against.
    if combined.size > 1:
        rank = float((combined <= current_combined).mean())
    else:
        rank = 0.5

    out = np.array(
        [
            tick_norm,
            seconds_remaining,
            log_seconds_remaining,
            min_combined,
            current_best_arb,
            leg_side_stub,
            leg_cost_stub,
            leg_age_stub,
            req_hedge_up,
            req_hedge_dn,
            time_since_best,
            has_arb_yet,
            current_combined,
            combined_minus_min,
            rank,
        ],
        dtype=np.float64,
    )
    out = np.where(np.isfinite(out), out, 0.0).astype(np.float32)
    assert out.shape[0] == 15
    return out


# Public API: the ordered feature-name list and the extractor.
__all__ = [
    "FEATURE_NAMES",
    "extract",
]