# File size: 23,264 Bytes

# 2b738f4

"""Limit Order Book (LOB) Reconstruction and Level 2 Features

What Jane Street sees that retail doesn't:
- Full Level 2 order book (10+ price levels, not just best bid/ask)
- Queue position for each order
- Order arrival/cancel rates
- Market depth profile
- Spread dynamics (widening = informed trading)
- Large order detection

This is the foundation of HIGH-FREQUENCY alpha.
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, NamedTuple
from collections import defaultdict
import bisect
import warnings
warnings.filterwarnings('ignore')


class OrderBookEntry:
    """One resting limit order.

    Attributes:
        price: Limit price of the order.
        quantity: Remaining (unfilled) size; mutable.
        order_id: Identifier used to look the order up again.
        side: Either 'bid' or 'ask'.
        timestamp: Time value supplied by the caller when the order was created.
    """

    def __init__(self, price: float, quantity: int, order_id: str,
                 side: str, timestamp: float):
        # Identity of the order
        self.order_id, self.side = order_id, side
        # Economics of the order
        self.price, self.quantity = price, quantity
        self.timestamp = timestamp

    def __repr__(self):
        # Rendered as "<SIDE> <qty>@<price with 2 decimals>", e.g. "BID 300@101.50"
        return f"{self.side.upper()} {self.quantity}@{self.price:.2f}"


class LimitOrderBook:
    """
    Full Limit Order Book reconstruction from message feed.

    Resting orders are stored per price level in arrival order, and
    execute_trade() consumes the best price first and the oldest order at
    that price first (price-time priority).

    Attributes:
        max_depth: Default number of levels returned by get_book_snapshot().
        bids / asks: price -> list of resting orders, oldest first.
        bid_prices: Prices that currently have resting bids, sorted descending.
        ask_prices: Prices that currently have resting asks, sorted ascending.
        order_map: order_id -> (side, price) for every resting order.
        trade_history: One dict per execute_trade() call that filled > 0.
        imbalance_history / spread_history / depth_history: Empty lists
            created for callers; this class never writes to them.
    """

    def __init__(self, max_depth: int = 10):
        self.max_depth = max_depth
        self.bids = {}  # price -> list of OrderBookEntry
        self.asks = {}  # price -> list of OrderBookEntry
        self.bid_prices = []  # Sorted descending
        self.ask_prices = []  # Sorted ascending
        self.order_map = {}  # order_id -> (side, price)

        # Statistics
        self.trade_history = []
        self.imbalance_history = []
        self.spread_history = []
        self.depth_history = []

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _side_state(self, side: str) -> Tuple[Dict, List]:
        """Return (levels dict, sorted price list) for `side`.

        Any value other than 'bid' is treated as the ask side.
        """
        if side == 'bid':
            return self.bids, self.bid_prices
        return self.asks, self.ask_prices

    @staticmethod
    def _insert_descending(prices: List[float], price: float) -> None:
        """Insert `price` into `prices`, keeping it sorted high-to-low.

        `bisect.insort` assumes ascending order, so the descending bid
        ladder needs its own binary search.
        """
        lo, hi = 0, len(prices)
        while lo < hi:
            mid = (lo + hi) // 2
            if prices[mid] > price:
                lo = mid + 1
            else:
                hi = mid
        prices.insert(lo, price)

    @staticmethod
    def _level_rows(levels: Dict, prices: List[float], depth: int,
                    side: str) -> List[Dict]:
        """Aggregate the first `depth` price levels into snapshot rows."""
        rows = []
        for price in prices[:depth]:
            orders = levels.get(price)
            if orders is None:
                continue
            rows.append({
                'price': price,
                'quantity': sum(o.quantity for o in orders),
                'num_orders': len(orders),
                'side': side
            })
        return rows

    @staticmethod
    def _top_quantity(levels: Dict, prices: List[float], n_levels: int) -> int:
        """Total resting quantity across the first `n_levels` price levels."""
        return sum(
            sum(o.quantity for o in levels[p])
            for p in prices[:n_levels] if p in levels
        )

    # ------------------------------------------------------------------
    # Book maintenance
    # ------------------------------------------------------------------
    def add_order(self, order: "OrderBookEntry"):
        """Add a limit order to the back of the queue at its price.

        If `order.order_id` is already resting in the book, the old order is
        cancelled first. Otherwise the old order would stay in the book with
        no entry in `order_map` and could never be cancelled.
        """
        if order.order_id in self.order_map:
            self.cancel_order(order.order_id)

        levels, prices = self._side_state(order.side)

        if order.price not in levels:
            levels[order.price] = []
            if order.side == 'bid':
                self._insert_descending(prices, order.price)
            else:
                bisect.insort(prices, order.price)

        levels[order.price].append(order)
        self.order_map[order.order_id] = (order.side, order.price)

    def cancel_order(self, order_id: str):
        """Cancel a resting limit order.

        Returns:
            True if `order_id` was known and has been removed, False otherwise.
        """
        if order_id not in self.order_map:
            return False

        side, price = self.order_map[order_id]
        levels, prices = self._side_state(side)

        if price in levels:
            levels[price] = [o for o in levels[price] if o.order_id != order_id]
            if not levels[price]:
                # Last order at this price: drop the level from the ladder too
                del levels[price]
                prices.remove(price)

        del self.order_map[order_id]
        return True

    def execute_trade(self, side: str, quantity: int,
                      aggressive: bool = True) -> Tuple[float, int]:
        """
        Execute an incoming order against the opposite side of the book.

        Args:
            side: Side of the incoming order; 'bid' consumes asks, anything
                else consumes bids.
            quantity: Size to execute; may be only partially filled.
            aggressive: Stored in the trade record only; it does not change
                how the order is matched.

        Returns: (avg_price, executed_qty); avg_price is 0.0 if nothing filled.
        """
        remaining = quantity
        total_cost = 0.0

        # Match against opposite side
        if side == 'bid':
            levels, prices = self.asks, self.ask_prices
        else:
            levels, prices = self.bids, self.bid_prices

        while remaining > 0 and prices:
            best_price = prices[0]

            if best_price not in levels:
                # Stale ladder entry without a level: discard and move on
                prices.pop(0)
                continue

            level_orders = levels[best_price]

            while remaining > 0 and level_orders:
                order = level_orders[0]
                exec_qty = min(remaining, order.quantity)

                total_cost += exec_qty * best_price
                remaining -= exec_qty
                order.quantity -= exec_qty

                if order.quantity <= 0:
                    # Fully filled: remove from queue and from the id index
                    level_orders.pop(0)
                    self.order_map.pop(order.order_id, None)

            if not level_orders:
                del levels[best_price]
                prices.pop(0)

        executed = quantity - remaining
        avg_price = total_cost / executed if executed > 0 else 0.0

        # Record trade
        if executed > 0:
            self.trade_history.append({
                'side': side,
                'quantity': executed,
                'avg_price': avg_price,
                'aggressive': aggressive
            })

        return avg_price, executed

    # ------------------------------------------------------------------
    # Top-of-book queries
    # ------------------------------------------------------------------
    def get_best_bid(self) -> Optional[float]:
        """Highest bid price, or None when there are no bids."""
        return self.bid_prices[0] if self.bid_prices else None

    def get_best_ask(self) -> Optional[float]:
        """Lowest ask price, or None when there are no asks."""
        return self.ask_prices[0] if self.ask_prices else None

    def get_mid_price(self) -> Optional[float]:
        """Average of best bid and best ask, or None if either side is empty."""
        bb = self.get_best_bid()
        ba = self.get_best_ask()
        if bb is not None and ba is not None:
            return (bb + ba) / 2
        return None

    def get_spread(self) -> Optional[float]:
        """Best ask minus best bid, or None if either side is empty."""
        bb = self.get_best_bid()
        ba = self.get_best_ask()
        if bb is not None and ba is not None:
            return ba - bb
        return None

    def get_spread_bps(self) -> Optional[float]:
        """Spread relative to mid in basis points.

        Returns None if either side is empty or the mid price is zero
        (the ratio is undefined there).
        """
        spread = self.get_spread()
        mid = self.get_mid_price()
        if spread is None or mid is None or mid == 0:
            return None
        return (spread / mid) * 10000

    # ------------------------------------------------------------------
    # Aggregated views
    # ------------------------------------------------------------------
    def get_book_snapshot(self, depth: Optional[int] = None) -> Dict:
        """Get a snapshot of the book.

        Args:
            depth: Number of price levels per side to list; None means
                `self.max_depth`. An explicit 0 yields empty level lists.

        Returns:
            Dict with per-level 'bids' / 'asks' rows (limited to `depth`),
            'mid_price', 'spread', 'spread_bps', the number of price levels
            per side ('bid_depth', 'ask_depth') and the total resting
            quantity per side over the WHOLE book.
        """
        if depth is None:
            depth = self.max_depth

        return {
            'bids': self._level_rows(self.bids, self.bid_prices, depth, 'bid'),
            'asks': self._level_rows(self.asks, self.ask_prices, depth, 'ask'),
            'mid_price': self.get_mid_price(),
            'spread': self.get_spread(),
            'spread_bps': self.get_spread_bps(),
            'bid_depth': len(self.bid_prices),
            'ask_depth': len(self.ask_prices),
            'total_bid_quantity': self._top_quantity(
                self.bids, self.bid_prices, len(self.bid_prices)),
            'total_ask_quantity': self._top_quantity(
                self.asks, self.ask_prices, len(self.ask_prices))
        }

    def get_order_imbalance(self, levels: int = 5) -> float:
        """
        Order imbalance at top N levels: (bid_qty - ask_qty) / (bid_qty + ask_qty).

        Positive = more resting bid quantity, negative = more resting ask
        quantity. Returns 0.0 when both sides are empty.
        """
        bid_qty = self._top_quantity(self.bids, self.bid_prices, levels)
        ask_qty = self._top_quantity(self.asks, self.ask_prices, levels)

        total = bid_qty + ask_qty
        if total == 0:
            return 0.0

        return (bid_qty - ask_qty) / total


class LOBFeatures:
    """
    Feature extraction from a reconstructed LimitOrderBook.

    All methods are static and read-only with respect to the book: they
    query snapshots / top-of-book values and return plain dicts, lists or a
    DataFrame.
    """

    @staticmethod
    def price_levels(book: "LimitOrderBook", n: int = 10) -> pd.DataFrame:
        """Price level data (Level 2 equivalent).

        Returns one row per price level with columns side, level (1 = best),
        price, quantity and num_orders; bids first, then asks.
        """
        snapshot = book.get_book_snapshot(depth=n)

        rows = [
            {
                'side': side,
                'level': rank,
                'price': level['price'],
                'quantity': level['quantity'],
                'num_orders': level['num_orders']
            }
            for side, key in (('bid', 'bids'), ('ask', 'asks'))
            for rank, level in enumerate(snapshot[key], start=1)
        ]

        return pd.DataFrame(rows)

    @staticmethod
    def depth_profile(book: "LimitOrderBook") -> Dict:
        """
        Market depth profile across price levels.

        Returns cumulative quantity within the top 1 / 5 / 10 levels per side
        (falling back to the deepest available level when the book is
        shallower), bid/ask depth ratios at levels 1 and 5 (1.0 when not
        computable) and a whole-book depth skew in (-1, 1).
        """
        snapshot = book.get_book_snapshot()

        # Cumulative depth, best level first
        cum_bid_qty = np.cumsum([b['quantity'] for b in snapshot['bids']])
        cum_ask_qty = np.cumsum([a['quantity'] for a in snapshot['asks']])

        def depth_at(cum, n_levels):
            """Cumulative quantity in the top n_levels, or all that exist."""
            if len(cum) >= n_levels:
                return cum[n_levels - 1]
            return cum[-1] if len(cum) > 0 else 0

        def ratio_at(n_levels):
            """Bid/ask cumulative ratio; 1.0 if either side lacks n_levels."""
            idx = n_levels - 1
            if (len(cum_bid_qty) > idx and len(cum_ask_qty) > idx
                    and cum_ask_qty[idx] > 0):
                return cum_bid_qty[idx] / cum_ask_qty[idx]
            return 1.0

        total_bid = snapshot['total_bid_quantity']
        total_ask = snapshot['total_ask_quantity']

        return {
            'bid_depth_1': depth_at(cum_bid_qty, 1),
            'bid_depth_5': depth_at(cum_bid_qty, 5),
            'bid_depth_10': depth_at(cum_bid_qty, 10),
            'ask_depth_1': depth_at(cum_ask_qty, 1),
            'ask_depth_5': depth_at(cum_ask_qty, 5),
            'ask_depth_10': depth_at(cum_ask_qty, 10),
            'depth_ratio_1': ratio_at(1),
            'depth_ratio_5': ratio_at(5),
            # +1 in the denominator keeps the skew defined for an empty book
            'depth_skew': (total_bid - total_ask) / (total_bid + total_ask + 1)
        }

    @staticmethod
    def queue_features(book: "LimitOrderBook") -> Dict:
        """
        Queue features at the best bid and best ask.

        Returns order count and total quantity at each best level, plus
        'queue_imbalance' = bid order count - ask order count. Missing sides
        contribute zeros.
        """
        snapshot = book.get_book_snapshot(depth=1)

        best_bid = snapshot['bids'][0] if snapshot['bids'] else None
        best_ask = snapshot['asks'][0] if snapshot['asks'] else None

        bid_len = best_bid['num_orders'] if best_bid else 0
        ask_len = best_ask['num_orders'] if best_ask else 0

        return {
            'bid_queue_length': bid_len,
            'ask_queue_length': ask_len,
            'bid_queue_qty': best_bid['quantity'] if best_bid else 0,
            'ask_queue_qty': best_ask['quantity'] if best_ask else 0,
            'queue_imbalance': bid_len - ask_len
        }

    @staticmethod
    def large_order_detection(book: "LimitOrderBook",
                              threshold_qty: float = 1000,
                              threshold_pct: float = 0.3) -> List[Dict]:
        """
        Detect unusually large resting orders.

        Args:
            book: Book to scan (all price levels on both sides).
            threshold_qty: Minimum order quantity to be reported.
            threshold_pct: NOTE: accepted for interface compatibility but
                currently unused; only `threshold_qty` selects orders.

        Returns:
            One dict per large order (side, price, quantity, pct_of_level,
            is_iceberg), sorted by quantity descending. 'is_iceberg' is a
            heuristic: the order is more than 3x the mean order size at its
            price level.
        """
        large_orders = []

        for side, side_name in [(book.bids, 'bid'), (book.asks, 'ask')]:
            for price, orders in side.items():
                total_at_price = sum(o.quantity for o in orders)
                avg_qty = np.mean([o.quantity for o in orders]) if orders else 0

                for order in orders:
                    if order.quantity < threshold_qty:
                        continue
                    large_orders.append({
                        'side': side_name,
                        'price': price,
                        'quantity': order.quantity,
                        'pct_of_level': order.quantity / total_at_price if total_at_price > 0 else 0,
                        'is_iceberg': order.quantity > avg_qty * 3  # Likely iceberg
                    })

        return sorted(large_orders, key=lambda x: x['quantity'], reverse=True)

    @staticmethod
    def spread_dynamics(book_history: List["LimitOrderBook"],
                          window: int = 10) -> Dict:
        """
        Spread, mid-price and imbalance statistics over the last `window` books.

        Books without a two-sided quote contribute no spread / mid sample.
        Returns {} when fewer than two spread samples are available.
        """
        spreads = []
        mids = []
        imbalances = []

        for book in book_history[-window:]:
            s = book.get_spread_bps()
            m = book.get_mid_price()

            if s is not None:
                spreads.append(s)
            if m is not None:
                mids.append(m)
            imbalances.append(book.get_order_imbalance())

        if len(spreads) < 2:
            return {}

        latest_spread = spreads[-1]

        return {
            'avg_spread_bps': np.mean(spreads),
            'spread_volatility': np.std(spreads),
            'spread_trend': latest_spread - spreads[0],
            # Share of samples that are no wider than the latest spread
            'spread_percentile': sum(1 for s in spreads if s <= latest_spread) / len(spreads),
            'mid_price_change_pct': (mids[-1] / mids[0] - 1) * 100 if len(mids) >= 2 and mids[0] > 0 else 0,
            'avg_imbalance': np.mean(imbalances),
            'imbalance_trend': imbalances[-1] - imbalances[0]
        }

    @staticmethod
    def order_flow_toxicity(book: "LimitOrderBook",
                              trade_history: List[Dict],
                              window: int = 50) -> Dict:
        """
        VPIN-like metric: |buy volume - sell volume| / total volume.

        Each of the last `window` trades is counted exactly once, as buy
        volume when its 'side' is 'bid' and as sell volume when it is 'ask'.
        Trades with any other side are ignored. This assumes 'side' is the
        aggressor's side, as recorded in LimitOrderBook.trade_history; if the
        records use the resting side instead, buy/sell labels swap but the
        toxicity value is unchanged.

        Args:
            book: Unused; kept so the call signature matches the other
                feature methods.
            trade_history: Trade dicts with at least 'side' and 'quantity'.
            window: Number of most recent trades to consider.

        Returns:
            {'vpin_approx', 'toxicity'} only (both 0.0) when there is no
            volume; otherwise those two keys plus 'buy_volume',
            'sell_volume' and 'total_volume'.
        """
        if not trade_history:
            return {'vpin_approx': 0.0, 'toxicity': 0.0}

        recent_trades = trade_history[-window:]

        buy_volume = sum(t['quantity'] for t in recent_trades
                         if t.get('side') == 'bid')
        sell_volume = sum(t['quantity'] for t in recent_trades
                          if t.get('side') == 'ask')

        total = buy_volume + sell_volume
        if total == 0:
            return {'vpin_approx': 0.0, 'toxicity': 0.0}

        # Toxicity = |buy_vol - sell_vol| / total
        vpin = abs(buy_volume - sell_volume) / total

        return {
            'vpin_approx': vpin,
            'toxicity': vpin,
            'buy_volume': buy_volume,
            'sell_volume': sell_volume,
            'total_volume': total
        }

    @staticmethod
    def all_features(book: "LimitOrderBook",
                     book_history: Optional[List["LimitOrderBook"]] = None) -> Dict:
        """Compute all LOB features at once.

        Keys from the sub-feature dicts are prefixed with 'depth_', 'queue_'
        and 'spread_dyn_' respectively. Spread dynamics are included only
        when `book_history` holds at least two books.
        """
        features = {}

        # Basic features
        snapshot = book.get_book_snapshot()
        features['mid_price'] = snapshot['mid_price']
        features['spread'] = snapshot['spread']
        features['spread_bps'] = snapshot['spread_bps']
        features['bid_depth_total'] = snapshot['total_bid_quantity']
        features['ask_depth_total'] = snapshot['total_ask_quantity']
        features['depth_imbalance'] = book.get_order_imbalance()

        # Depth profile
        depth = LOBFeatures.depth_profile(book)
        features.update({f'depth_{k}': v for k, v in depth.items()})

        # Queue features
        queue = LOBFeatures.queue_features(book)
        features.update({f'queue_{k}': v for k, v in queue.items()})

        # Large orders
        large = LOBFeatures.large_order_detection(book)
        features['n_large_orders'] = len(large)
        features['large_order_total_qty'] = sum(o['quantity'] for o in large)

        # Spread dynamics
        if book_history and len(book_history) >= 2:
            dynamics = LOBFeatures.spread_dynamics(book_history)
            features.update({f'spread_dyn_{k}': v for k, v in dynamics.items()})

        return features


def generate_synthetic_lob_feed(n_messages: int = 1000,
                                  base_price: float = 100.0,
                                  tick_size: float = 0.01,
                                  seed: Optional[int] = 42) -> List[Dict]:
    """Generate synthetic LOB message feed for testing.

    The feed starts with up to 50 'add' messages that seed the book (bids at
    or below `base_price`, asks at or above it), followed by a random mix of
    'add' (50%), 'cancel' (30%) and 'trade' (20%) messages.

    Args:
        n_messages: Total number of messages to produce. Values below 50
            shorten the seeding phase instead of being exceeded.
        base_price: Price around which orders are placed.
        tick_size: Price increment used for offsets from `base_price`.
        seed: Seed for a private random generator. The default (42)
            reproduces the same feed on every call; None gives a
            non-deterministic feed.

    Returns:
        List of message dicts. Every message has 'type' and 'timestamp'
        (message index / 1000). 'add' adds order_id, side, price, quantity;
        'cancel' adds order_id; 'trade' adds side, quantity.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # without clobbering the caller's global NumPy random state.
    rng = np.random.RandomState(seed)

    messages = []
    order_counter = 0
    n_seed_orders = max(0, min(50, n_messages))

    # Initialize with some orders
    for _ in range(n_seed_orders):
        side = 'bid' if rng.rand() < 0.5 else 'ask'
        price = base_price + rng.randint(-50, 50) * tick_size
        # Keep seed orders on their own side of base_price
        if side == 'ask':
            price = max(price, base_price)
        else:
            price = min(price, base_price)

        messages.append({
            'type': 'add',
            'order_id': f'order_{order_counter}',
            'side': side,
            'price': round(price, 2),
            'quantity': rng.randint(100, 1000),
            'timestamp': len(messages) / 1000.0
        })
        order_counter += 1

    # Generate flowing messages
    for _ in range(n_messages - n_seed_orders):
        msg_type = rng.choice(['add', 'cancel', 'trade'], p=[0.5, 0.3, 0.2])

        if msg_type == 'add':
            side = 'bid' if rng.rand() < 0.5 else 'ask'
            offset = rng.exponential(10) * tick_size
            price = base_price + (offset if side == 'ask' else -offset)
            price = round(max(price, 0.01), 2)

            messages.append({
                'type': 'add',
                'order_id': f'order_{order_counter}',
                'side': side,
                'price': price,
                'quantity': rng.randint(100, 2000),
                'timestamp': len(messages) / 1000.0
            })
            order_counter += 1

        elif msg_type == 'cancel' and order_counter > 0:
            # Cancel a random order id issued so far (it may already be gone)
            messages.append({
                'type': 'cancel',
                'order_id': f'order_{rng.randint(0, order_counter)}',
                'timestamp': len(messages) / 1000.0
            })

        else:
            # Trade
            side = 'bid' if rng.rand() < 0.5 else 'ask'
            messages.append({
                'type': 'trade',
                'side': side,
                'quantity': rng.randint(100, 500),
                'timestamp': len(messages) / 1000.0
            })

    return messages


def process_message_feed(messages: List[Dict]) -> Tuple[LimitOrderBook, List[Dict], List[Dict]]:
    """Process a message feed and reconstruct the LOB.

    Args:
        messages: Dicts with a 'type' of 'add', 'cancel' or 'trade' plus the
            fields that type needs ('add': price, quantity, order_id, side,
            timestamp; 'cancel': order_id; 'trade': side, quantity,
            timestamp). Other types are ignored.

    Returns:
        (book, trades, features_history):
        - book: the final LimitOrderBook (max_depth=20).
        - trades: one dict per trade message that filled a non-zero
          quantity (timestamp, side as given in the message, quantity,
          avg_price).
        - features_history: LOBFeatures.all_features() output plus
          'timestamp', taken after the 1st, 101st, 201st, ... message.
    """
    import copy  # local import: only needed for the periodic book snapshots

    snapshot_every = 100  # messages between feature snapshots
    history_window = 10   # snapshots kept; matches spread_dynamics' default window

    book = LimitOrderBook(max_depth=20)
    trades = []
    book_history = []
    features_history = []

    for index, msg in enumerate(messages):
        msg_type = msg['type']

        if msg_type == 'add':
            book.add_order(OrderBookEntry(
                price=msg['price'],
                quantity=msg['quantity'],
                order_id=msg['order_id'],
                side=msg['side'],
                timestamp=msg['timestamp']
            ))

        elif msg_type == 'cancel':
            book.cancel_order(msg['order_id'])

        elif msg_type == 'trade':
            # The message names the book side that gets hit, so the incoming
            # (aggressing) order is on the opposite side.
            aggressor = 'bid' if msg['side'] == 'ask' else 'ask'
            avg_price, qty = book.execute_trade(aggressor, msg['quantity'], aggressive=True)
            # Nothing filled (empty book side): not a trade, same rule the
            # book applies to its own trade_history.
            if qty > 0:
                trades.append({
                    'timestamp': msg['timestamp'],
                    'side': msg['side'],
                    'quantity': qty,
                    'avg_price': avg_price
                })

        # Snapshot every `snapshot_every` messages, counted by message index
        # (counting by len(book_history) would fire only once).
        if index % snapshot_every == 0:
            # Store a frozen copy; appending the live book would make every
            # history entry reflect the same (latest) state.
            book_history.append(copy.deepcopy(book))
            del book_history[:-history_window]
            features = LOBFeatures.all_features(book, book_history)
            features['timestamp'] = msg['timestamp']
            features_history.append(features)

    return book, trades, features_history


if __name__ == '__main__':
    # Demo: build a synthetic feed, replay it through the book and print a
    # summary of the final state and the most recent feature snapshot.
    import numbers  # local import: only the script entry point needs it

    def _fmt(value, spec):
        """Format `value` with `spec`, or 'n/a' when the value is None."""
        return 'n/a' if value is None else format(value, spec)

    print("=" * 70)
    print("  LIMIT ORDER BOOK RECONSTRUCTION")
    print("=" * 70)

    # Generate synthetic data
    messages = generate_synthetic_lob_feed(n_messages=5000)

    # Process
    book, trades, features = process_message_feed(messages)

    # Final snapshot
    snapshot = book.get_book_snapshot(depth=5)

    # mid_price / spread_bps are None when one side of the book is empty,
    # so they go through _fmt instead of a bare format spec.
    print("\nFinal LOB State:")
    print(f"  Mid Price: ${_fmt(snapshot['mid_price'], '.2f')}")
    print(f"  Spread: {_fmt(snapshot['spread_bps'], '.1f')} bps")
    print(f"  Bid Depth: {snapshot['bid_depth']} levels")
    print(f"  Ask Depth: {snapshot['ask_depth']} levels")
    print(f"  Total Bid Qty: {snapshot['total_bid_quantity']:,}")
    print(f"  Total Ask Qty: {snapshot['total_ask_quantity']:,}")
    print(f"  Order Imbalance: {book.get_order_imbalance():.3f}")

    # Level 2
    print("\nLevel 2 Book (top 5):")
    levels = LOBFeatures.price_levels(book, n=5)
    print(levels.to_string())

    # Features
    if features:
        print("\nLatest LOB Features:")
        latest = features[-1]
        for k, v in latest.items():
            # numbers.Real also matches NumPy integer/float scalars, which
            # a plain (int, float) check would skip.
            if isinstance(v, numbers.Real):
                print(f"  {k}: {float(v):.4f}")

    print(f"\n  Trades executed: {len(trades)}")
    print(f"  Total messages processed: {len(messages)}")
    print("\n  This is what Jane Street sees every microsecond.")
    print("  Order imbalance, queue position, depth profile = PURE ALPHA.")