import os
import argparse
from typing import List, Optional, Sequence

from dotenv import load_dotenv
from clickhouse_driver import Client as ClickHouseClient
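# Expects ClickHouse tables `mints` (mint_address, timestamp) and `trades`
# (base_address, timestamp, price_usd); connection settings are read from
# CLICKHOUSE_* environment variables (optionally via a .env file).
#
# Example invocations (the script filename is a placeholder):
#   python hyperparam_analysis_sql.py
#   python hyperparam_analysis_sql.py --windows_min 5,10,30,60 --min_price_usd 0.000001
#   python hyperparam_analysis_sql.py --token_address <mint_address>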


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Fast SQL-based hyperparameter analysis (trades-only) for seq_len + horizons."
    )
    parser.add_argument("--token_address", type=str, default=None, help="Analyze a single token address.")
    parser.add_argument(
        "--windows_min",
        type=str,
        default="5,10,30,60",
        help="Comma-separated trade-count windows in minutes (e.g. '5,10,30,60').",
    )
    parser.add_argument(
        "--min_price_usd",
        type=float,
        default=0.0,
        help="Treat trades with price_usd <= min_price_usd as invalid (default: 0.0).",
    )
    return parser.parse_args()


def _parse_windows(windows_min: str) -> List[int]:
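    """Parse the comma-separated --windows_min string into sorted, unique, positive ints."""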
    out: List[int] = []
    for part in (windows_min or "").split(","):
        part = part.strip()
        if not part:
            continue
        out.append(int(part))
    out = sorted({w for w in out if w > 0})
    if not out:
        raise ValueError("No valid --windows_min provided.")
    return out


def _connect_clickhouse_from_env() -> ClickHouseClient:
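    """
    Build a ClickHouse native-protocol client from CLICKHOUSE_* environment
    variables; user, password, and database are only passed when set.
    """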
    ch_host = os.getenv("CLICKHOUSE_HOST", "localhost")
    ch_port = int(os.getenv("CLICKHOUSE_NATIVE_PORT", "9000"))
    ch_user = os.getenv("CLICKHOUSE_USER", None)
    ch_pass = os.getenv("CLICKHOUSE_PASSWORD", None)
    ch_db = os.getenv("CLICKHOUSE_DB", None)

    kwargs = {"host": ch_host, "port": ch_port}
    if ch_user:
        kwargs["user"] = ch_user
    if ch_pass:
        kwargs["password"] = ch_pass
    if ch_db:
        kwargs["database"] = ch_db
    return ClickHouseClient(**kwargs)


def _quantile_levels() -> Sequence[float]:
    # Keep these aligned with the printed labels below.
    return (0.25, 0.5, 0.75, 0.90, 0.95, 0.99)


def _fmt_q_tuple(q: Sequence[float]) -> str:
    # Labels match _quantile_levels()
    labels = ["25%", "50%", "75%", "90%", "95%", "99%"]
    parts = []
    for lbl, v in zip(labels, q):
        parts.append(f"{lbl}: {float(v):.2f}")
    return " | ".join(parts)


def _print_row(prefix: str, mean_v: float, q_tuple: Sequence[float], max_v: float) -> None:
    print(f"[{prefix}]")
    print(f"  Mean: {float(mean_v):.2f} | Median: {float(q_tuple[1]):.2f} | Max: {float(max_v):.2f}")
    print(f"  {_fmt_q_tuple(q_tuple)}")


def fetch_aggregated_stats_sql(
    ch: ClickHouseClient,
    windows_min: List[int],
    min_price_usd: float,
    token_address: Optional[str] = None,
) -> List[tuple]:
    """
    One ClickHouse query that computes distribution statistics directly (no per-token loop in Python).
    Returns two groups:
      - grp='all'
      - grp='subset' where trades_full > 50 and lifespan_sec > 300 (5 minutes)
    """
    q_levels = _quantile_levels()
    q_levels_sql = ", ".join(str(q) for q in q_levels)

    per_token_window_exprs = []
    agg_window_exprs = []
    for w in windows_min:
        sec = int(w) * 60
        per_token_window_exprs.append(
            f"countIf(is_valid AND (trade_ts - mint_ts) <= {sec}) AS trades_{w}m"
        )
        agg_window_exprs.append(
            f"avg(trades_{w}m) AS trades_{w}m_mean,"
            f" quantilesExact({q_levels_sql})(trades_{w}m) AS trades_{w}m_q,"
            f" max(trades_{w}m) AS trades_{w}m_max"
        )

    params = {"min_price": float(min_price_usd)}
    token_filter = ""
    if token_address:
        token_filter = "AND m.mint_address = %(token)s"
        params["token"] = token_address

    # Note: trades are pre-filtered to tokens present in `mints` so the join stays small.
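    # The ARRAY JOIN over ['all', 'subset'] duplicates each per-token row into both
    # groups, so a single scan yields the full population and the filtered subset
    # (trades_full > 50 and lifespan_sec > 300) in one result set.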
    query = f"""
    WITH
      per_token AS (
        SELECT
          m.mint_address AS mint_address,
          toUnixTimestamp(m.timestamp) AS mint_ts,
          countIf(is_valid) AS trades_full,
          (maxIf(trade_ts, is_valid) - mint_ts) AS lifespan_sec,
          (toUnixTimestamp(argMaxIf(t.timestamp, t.price_usd, is_valid)) - mint_ts) AS time_to_ath_sec,
          {", ".join(per_token_window_exprs)}
        FROM mints AS m
        INNER JOIN
        (
          SELECT
            base_address,
            timestamp,
            toUnixTimestamp(timestamp) AS trade_ts,
            price_usd,
            (price_usd > %(min_price)s) AS is_valid
          FROM trades
          WHERE base_address IN (SELECT mint_address FROM mints)
        ) AS t
        ON t.base_address = m.mint_address
        WHERE 1=1
          {token_filter}
        GROUP BY
          mint_address,
          mint_ts
        HAVING
          trades_full > 0
      )
    SELECT
      grp,
      count() AS tokens,

      avg(trades_full) AS trades_full_mean,
      quantilesExact({q_levels_sql})(trades_full) AS trades_full_q,
      max(trades_full) AS trades_full_max,

      avg(lifespan_sec / 60.0) AS lifespan_min_mean,
      quantilesExact({q_levels_sql})(lifespan_sec / 60.0) AS lifespan_min_q,
      max(lifespan_sec / 60.0) AS lifespan_min_max,

      avg(time_to_ath_sec / 60.0) AS tta_min_mean,
      quantilesExact({q_levels_sql})(time_to_ath_sec / 60.0) AS tta_min_q,
      max(time_to_ath_sec / 60.0) AS tta_min_max,

      {", ".join(agg_window_exprs)}
    FROM per_token
    ARRAY JOIN ['all', 'subset'] AS grp
    WHERE (grp = 'all')
       OR (grp = 'subset' AND trades_full > 50 AND lifespan_sec > 300)
    GROUP BY grp
    ORDER BY grp
    """

    return ch.execute(query, params)


def fetch_single_token_sql(
    ch: ClickHouseClient,
    windows_min: List[int],
    min_price_usd: float,
    token_address: str,
) -> Optional[tuple]:
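    """
    Return one row for the given token: (mint_address, mint_ts, trades_full,
    lifespan_sec, time_to_ath_sec, then one valid-trade count per window), or
    None if the token has no valid trades.
    """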
    per_token_window_exprs = []
    for w in windows_min:
        sec = int(w) * 60
        per_token_window_exprs.append(
            f"countIf(is_valid AND (trade_ts - mint_ts) <= {sec}) AS trades_{w}m"
        )

    params = {"min_price": float(min_price_usd), "token": token_address}
    query = f"""
    SELECT
      m.mint_address AS mint_address,
      toUnixTimestamp(m.timestamp) AS mint_ts,
      countIf(is_valid) AS trades_full,
      (maxIf(trade_ts, is_valid) - mint_ts) AS lifespan_sec,
      (toUnixTimestamp(argMaxIf(t.timestamp, t.price_usd, is_valid)) - mint_ts) AS time_to_ath_sec,
      {", ".join(per_token_window_exprs)}
    FROM mints AS m
    INNER JOIN
    (
      SELECT
        base_address,
        timestamp,
        toUnixTimestamp(timestamp) AS trade_ts,
        price_usd,
        (price_usd > %(min_price)s) AS is_valid
      FROM trades
      WHERE base_address = %(token)s
    ) AS t
    ON t.base_address = m.mint_address
    WHERE m.mint_address = %(token)s
    GROUP BY
      mint_address,
      mint_ts
    HAVING
      trades_full > 0
    """
    rows = ch.execute(query, params)
    return rows[0] if rows else None


def main() -> None:
    load_dotenv()
    args = parse_args()
    windows_min = _parse_windows(args.windows_min)

    print("--- Hyperparameter Calibration Analysis (FAST SQL) ---")
    print(f"Windows (min): {windows_min}")
    print(f"Valid trade filter: price_usd > {float(args.min_price_usd)}")

    ch = _connect_clickhouse_from_env()
    if args.token_address:
        row = fetch_single_token_sql(
            ch=ch,
            windows_min=windows_min,
            min_price_usd=float(args.min_price_usd),
            token_address=args.token_address,
        )
        if not row:
            print("Token not found (or no valid trades).")
            return

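        # Row layout: mint_address, mint_ts, trades_full, lifespan_sec,
        # time_to_ath_sec, then one count per requested window.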
        mint_addr = row[0]
        trades_full = int(row[2])
        lifespan_min = float(row[3]) / 60.0
        tta_min = float(row[4]) / 60.0
        print("\n" + "=" * 40)
        print("RESULTS (SINGLE TOKEN)")
        print("=" * 40)
        print(f"Token: {mint_addr}")
        print(f"Valid trades: {trades_full}")
        print(f"Lifespan (min): {lifespan_min:.2f}")
        print(f"Time to ATH (min): {tta_min:.2f}")
        for i, w in enumerate(windows_min):
            print(f"Trades in first {w}m: {int(row[5 + i])}")
    else:
        rows = fetch_aggregated_stats_sql(
            ch=ch,
            windows_min=windows_min,
            min_price_usd=float(args.min_price_usd),
            token_address=None,
        )
        if not rows:
            print("No tokens found with valid trades.")
            return

        print("\n" + "=" * 40)
        print("RESULTS (DISTRIBUTION)")
        print("=" * 40)

        # Row layout:
        # grp, tokens,
        # trades_full_mean, trades_full_q(tuple), trades_full_max,
        # lifespan_min_mean, lifespan_min_q(tuple), lifespan_min_max,
        # tta_min_mean, tta_min_q(tuple), tta_min_max,
        # repeated for each window: mean, q(tuple), max
        for row in rows:
            grp = row[0]
            tokens = int(row[1])
            print(f"\n--- Group: {grp} (tokens={tokens}) ---")

            _print_row("Trades (Full History, Valid Only)", row[2], row[3], row[4])
            print("")
            _print_row("Token Lifespan (Minutes)", row[5], row[6], row[7])
            print("")
            _print_row("Time to ATH (Minutes)", row[8], row[9], row[10])

            cursor = 11
            for w in windows_min:
                mean_v = row[cursor]
                q_v = row[cursor + 1]
                max_v = row[cursor + 2]
                cursor += 3
                print("")
                _print_row(f"Trades in First {w} Minutes (Valid Only)", mean_v, q_v, max_v)

    print("\nRecommendation Logic (Trades-only):")
    print("- Horizons: look at Time-to-ATH p90/p95 (all vs subset).")
    print("- Max seq len: look at Trades-in-first-(max horizon) p95/p99.")
    print("  Then add headroom for non-trade events (transfers/pool/liquidity/etc).")


if __name__ == "__main__":
    main()