Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

data/data_loader.py +31 -17
data/ohlc_stats.npz +1 -1
inference.py +1 -1
log.log +2 -2
models/wallet_encoder.py +8 -10
sample_DYtPmhyxPDbMEdVP_0.json +0 -0
scripts/analyze_distribution.py +49 -0
scripts/cache_dataset.py +8 -4

data/data_loader.py CHANGED Viewed

@@ -128,9 +128,22 @@ class OracleDataset(Dataset):
                  cache_dir: Optional[Union[str, Path]] = None,
                  start_date: Optional[datetime.datetime] = None,
                  min_trade_usd: float = 0.0,
-                 max_seq_len: int = 8192):
         self.max_seq_len = max_seq_len
         # --- NEW: Create a persistent requests session for efficiency ---
         # Configure robust HTTP session
@@ -633,10 +646,18 @@ class OracleDataset(Dataset):
             sell_count = sum(1 for e in trades_win if e.get('trade_direction') == 1)
             volume = sum(float(e.get('total_usd', 0.0) or 0.0) for e in trades_win)
             total_txns = len(trades_win) + len(xfers_win)
-            global_fees_paid = sum(float(e.get('priority_fee', 0.0) or 0.0) for e in trades_win) + \
-                               sum(float(e.get('priority_fee', 0.0) or 0.0) for e in xfers_win)
-            smart_trader_addrs = set(e['wallet_address'] for e in trades_win if e.get('event_type') == 'SmartWallet_Trade')
             smart_traders = len(smart_trader_addrs)
             kol_addrs = set()
@@ -825,7 +846,7 @@ class OracleDataset(Dataset):
             # --- Define all expected numerical keys for a profile ---
             expected_profile_keys = [
-                'age', 'deployed_tokens_count', 'deployed_tokens_migrated_pct',
                 'deployed_tokens_avg_lifetime_sec', 'deployed_tokens_avg_peak_mc_usd',
                 'deployed_tokens_median_peak_mc_usd', 'balance', 'transfers_in_count',
                 'transfers_out_count', 'spl_transfers_in_count', 'spl_transfers_out_count',
@@ -852,14 +873,7 @@ class OracleDataset(Dataset):
             social_data['has_telegram'] = bool(social_data.get('telegram_channel'))
             social_data['is_exchange_wallet'] = 'exchange_wallet' in profile_data.get('tags', [])
-            # --- Calculate 'age' based on user's logic ---
-            funded_ts = profile_data.get('funded_timestamp', 0)
-            if funded_ts and funded_ts > 0:
-                age_seconds = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) - funded_ts
-            else:
-                age_seconds = 12_960_000
-            profile_data['age'] = float(age_seconds)
             username = social_data.get('pumpfun_username') or social_data.get('twitter_username') or social_data.get('kolscan_name')
@@ -909,12 +923,12 @@ class OracleDataset(Dataset):
                     'mint_address': mint_addr,
                     'holding_time': float(holding_item.get('holding_time', 0.0) or 0.0),
                     'balance_pct_to_supply': min(1.0, float(holding_item.get('balance_pct_to_supply', 0.0) or 0.0)),
-                    'history_bought_cost_sol': min(1000.0, float(holding_item.get('history_bought_cost_sol', 0.0) or 0.0)),
                     'bought_amount_sol_pct_to_native_balance': min(1.0, float(holding_item.get('bought_amount_sol_pct_to_native_balance', 0.0) or 0.0)),
                     'history_total_buys': float(holding_item.get('history_total_buys', 0.0) or 0.0),
                     'history_total_sells': float(holding_item.get('history_total_sells', 0.0) or 0.0),
                     'realized_profit_pnl': float(holding_item.get('realized_profit_pnl', 0.0) or 0.0),
-                    'realized_profit_sol': max(-1000.0, min(1000.0, float(holding_item.get('realized_profit_sol', 0.0) or 0.0))),
                     'history_transfer_in': float(holding_item.get('history_transfer_in', 0.0) or 0.0),
                     'history_transfer_out': float(holding_item.get('history_transfer_out', 0.0) or 0.0),
                     'avarage_trade_gap_seconds': float(holding_item.get('avarage_trade_gap_seconds', 0.0) or 0.0),
@@ -926,7 +940,7 @@ class OracleDataset(Dataset):
             compact_profile = {'wallet_address': addr}
             for key in expected_profile_keys:
                 compact_profile[key] = float(profile_data.get(key, 0.0) or 0.0)
-            compact_profile['age'] = float(profile_data.get('age', 0.0) or 0.0)
             compact_social = {
                 'has_pf_profile': bool(social_data.get('has_pf_profile', False)),
@@ -2073,7 +2087,7 @@ class OracleDataset(Dataset):
                 'mev_protection': 1 if trade.get('mev_protection', 0) > 0 else 0,
                 'token_amount_pct_of_holding': token_pct_hold,
                 'quote_amount_pct_of_holding': quote_pct_hold,
-                'slippage': min(10.0, float(trade.get('slippage', 0.0) or 0.0)),
                 'token_amount_pct_to_total_supply': token_pct_supply,
                 'success': is_success,
                 'is_bundle': trade.get('is_bundle', False),

                  cache_dir: Optional[Union[str, Path]] = None,
                  start_date: Optional[datetime.datetime] = None,
                  min_trade_usd: float = 0.0,
+                 max_seq_len: int = 8192,
+                 p99_clamps: Optional[Dict[str, float]] = None):
         self.max_seq_len = max_seq_len
+        # --- P99 data-driven clamp values (replace hardcoded min/max) ---
+        self.p99_clamps = {
+            'slippage': 1.0,
+            'priority_fee': 0.1,
+            'total_usd': 100000.0,
+            'history_bought_cost_sol': 30.0,
+            'realized_profit_sol': 150.0,
+        }
+        if p99_clamps:
+            self.p99_clamps.update(p99_clamps)
+            print(f"INFO: Using P99 clamps: {self.p99_clamps}")
         # --- NEW: Create a persistent requests session for efficiency ---
         # Configure robust HTTP session
             sell_count = sum(1 for e in trades_win if e.get('trade_direction') == 1)
             volume = sum(float(e.get('total_usd', 0.0) or 0.0) for e in trades_win)
             total_txns = len(trades_win) + len(xfers_win)
+            global_fees_paid = sum(
+                float(e.get('priority_fee', 0.0) or 0.0) + float(e.get('bribe_fee', 0.0) or 0.0)
+                for e in trades_win
+            )
+            smart_trader_addrs = set(
+                e['wallet_address'] for e in trade_events
+                if e.get('event_type') == 'SmartWallet_Trade'
+                and e.get('success', False)
+                and e['timestamp'] <= ts_value
+                and holder_pct_map_ts.get(e['wallet_address'], 0.0) > 0.0
+            )
             smart_traders = len(smart_trader_addrs)
             kol_addrs = set()
             # --- Define all expected numerical keys for a profile ---
             expected_profile_keys = [
+                'deployed_tokens_count', 'deployed_tokens_migrated_pct',
                 'deployed_tokens_avg_lifetime_sec', 'deployed_tokens_avg_peak_mc_usd',
                 'deployed_tokens_median_peak_mc_usd', 'balance', 'transfers_in_count',
                 'transfers_out_count', 'spl_transfers_in_count', 'spl_transfers_out_count',
             social_data['has_telegram'] = bool(social_data.get('telegram_channel'))
             social_data['is_exchange_wallet'] = 'exchange_wallet' in profile_data.get('tags', [])
             username = social_data.get('pumpfun_username') or social_data.get('twitter_username') or social_data.get('kolscan_name')
                     'mint_address': mint_addr,
                     'holding_time': float(holding_item.get('holding_time', 0.0) or 0.0),
                     'balance_pct_to_supply': min(1.0, float(holding_item.get('balance_pct_to_supply', 0.0) or 0.0)),
+                    'history_bought_cost_sol': min(self.p99_clamps['history_bought_cost_sol'], float(holding_item.get('history_bought_cost_sol', 0.0) or 0.0)),
                     'bought_amount_sol_pct_to_native_balance': min(1.0, float(holding_item.get('bought_amount_sol_pct_to_native_balance', 0.0) or 0.0)),
                     'history_total_buys': float(holding_item.get('history_total_buys', 0.0) or 0.0),
                     'history_total_sells': float(holding_item.get('history_total_sells', 0.0) or 0.0),
                     'realized_profit_pnl': float(holding_item.get('realized_profit_pnl', 0.0) or 0.0),
+                    'realized_profit_sol': max(-self.p99_clamps['realized_profit_sol'], min(self.p99_clamps['realized_profit_sol'], float(holding_item.get('realized_profit_sol', 0.0) or 0.0))),
                     'history_transfer_in': float(holding_item.get('history_transfer_in', 0.0) or 0.0),
                     'history_transfer_out': float(holding_item.get('history_transfer_out', 0.0) or 0.0),
                     'avarage_trade_gap_seconds': float(holding_item.get('avarage_trade_gap_seconds', 0.0) or 0.0),
             compact_profile = {'wallet_address': addr}
             for key in expected_profile_keys:
                 compact_profile[key] = float(profile_data.get(key, 0.0) or 0.0)
             compact_social = {
                 'has_pf_profile': bool(social_data.get('has_pf_profile', False)),
                 'mev_protection': 1 if trade.get('mev_protection', 0) > 0 else 0,
                 'token_amount_pct_of_holding': token_pct_hold,
                 'quote_amount_pct_of_holding': quote_pct_hold,
+                'slippage': min(self.p99_clamps['slippage'], float(trade.get('slippage', 0.0) or 0.0)),
                 'token_amount_pct_to_total_supply': token_pct_supply,
                 'success': is_success,
                 'is_bundle': trade.get('is_bundle', False),

data/ohlc_stats.npz CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87f6e823bed45b3e399d6fe2ab46f3297d80a623e5a41cca785a60c5a7db067d
 size 1660

 version https://git-lfs.github.com/spec/v1
+oid sha256:1d757990a0158118444be61f3d944dfb125237928809b4568ac209ab260f032e
 size 1660

inference.py CHANGED Viewed

@@ -29,7 +29,7 @@ if __name__ == "__main__":
     print("--- Oracle Inference Script (Full Pipeline Test) ---")
     # --- 1. Define Configs ---
-    OHLC_SEQ_LEN = 60
     print(f"Using {vocab.NUM_EVENT_TYPES} event types from vocabulary.")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

     print("--- Oracle Inference Script (Full Pipeline Test) ---")
     # --- 1. Define Configs ---
+    OHLC_SEQ_LEN = 300
     print(f"Using {vocab.NUM_EVENT_TYPES} event types from vocabulary.")
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

log.log CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c605ecab2de1c8c8442dda85ada5345b9d6ba43aae4095130f1d92ce6261c127
-size 44400

 version https://git-lfs.github.com/spec/v1
+oid sha256:656c6818f224b26869b5d0ae10f6b522ff7eb5c7b1b3aeb59b34c3db218338a9
+size 11360

models/wallet_encoder.py CHANGED Viewed

@@ -47,14 +47,14 @@ class WalletEncoder(nn.Module):
         self.mmp_dim = self.encoder.embedding_dim # 1152
         # === 1. Profile Encoder (FIXED) ===
-        # 1 age + 5 deployer_stats + 1 balance + 4 lifetime_counts +
-        # 3 lifetime_trading + 12 1d_stats + 12 7d_stats = 38
-        self.profile_numerical_features = 38
         self.profile_num_norm = nn.LayerNorm(self.profile_numerical_features)
         # FIXED: Input dim no longer has bool embed or deployed tokens embed
-        profile_mlp_in_dim = self.profile_numerical_features # 38
         self.profile_encoder_mlp = self._build_mlp(profile_mlp_in_dim, d_model)
@@ -152,17 +152,15 @@ class WalletEncoder(nn.Module):
     def _encode_profile_batch(self, profile_rows, device):
         batch_size = len(profile_rows)
-        # FIXED: 38 numerical features
         num_tensor = torch.zeros(batch_size, self.profile_numerical_features, device=device, dtype=self.dtype)
         # bool_tensor removed
         # time_tensor removed
         for i, row in enumerate(profile_rows):
-            # A: Numerical (FIXED: 38 features, MUST be present)
             num_data = [
-                # 1. Age
-                row.get('age', 0.0),
-                # 2. Deployed Token Aggregates (5)
                 row.get('deployed_tokens_count', 0.0),
                 row.get('deployed_tokens_migrated_pct', 0.0),
                 row.get('deployed_tokens_avg_lifetime_sec', 0.0),
@@ -195,7 +193,7 @@ class WalletEncoder(nn.Module):
             # C: Booleans and deployed_tokens lists are GONE
-        # Log-normalize all numerical features (age, stats, etc.)
         num_embed = self.profile_num_norm(self._safe_signed_log(num_tensor))
         # The profile fused tensor is now just the numerical embeddings

         self.mmp_dim = self.encoder.embedding_dim # 1152
         # === 1. Profile Encoder (FIXED) ===
+        # 5 deployer_stats + 1 balance + 4 lifetime_counts +
+        # 3 lifetime_trading + 12 1d_stats + 12 7d_stats = 37
+        self.profile_numerical_features = 37
         self.profile_num_norm = nn.LayerNorm(self.profile_numerical_features)
         # FIXED: Input dim no longer has bool embed or deployed tokens embed
+        profile_mlp_in_dim = self.profile_numerical_features # 37
         self.profile_encoder_mlp = self._build_mlp(profile_mlp_in_dim, d_model)
     def _encode_profile_batch(self, profile_rows, device):
         batch_size = len(profile_rows)
+        # FIXED: 37 numerical features
         num_tensor = torch.zeros(batch_size, self.profile_numerical_features, device=device, dtype=self.dtype)
         # bool_tensor removed
         # time_tensor removed
         for i, row in enumerate(profile_rows):
+            # A: Numerical (FIXED: 37 features, MUST be present)
             num_data = [
+                # 1. Deployed Token Aggregates (5)
                 row.get('deployed_tokens_count', 0.0),
                 row.get('deployed_tokens_migrated_pct', 0.0),
                 row.get('deployed_tokens_avg_lifetime_sec', 0.0),
             # C: Booleans and deployed_tokens lists are GONE
+        # Log-normalize all numerical features (stats, etc.)
         num_embed = self.profile_num_norm(self._safe_signed_log(num_tensor))
         # The profile fused tensor is now just the numerical embeddings

sample_DYtPmhyxPDbMEdVP_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

scripts/analyze_distribution.py CHANGED Viewed

@@ -27,6 +27,55 @@ def get_client():
         database=CLICKHOUSE_DATABASE
     )
 def fetch_all_metrics(client):
     """
     Fetches all needed metrics for all tokens in a single query.

         database=CLICKHOUSE_DATABASE
     )
+def compute_p99_clamps(client):
+    """
+    Computes P99 percentile clamp values from ClickHouse for fields prone to
+    garbage outliers. These values replace hardcoded clamps in data_loader.py.
+
+    Two aggregate queries are issued: one over successful rows of `trades`
+    (slippage, priority_fee, total_usd) and one over `wallet_holdings`
+    (history_bought_cost_sol, realized_profit_sol). Each percentile is floored
+    at a small positive minimum so a clamp can never collapse to zero.
+
+    A percentile that is missing, None, non-numeric, NaN or infinite keeps the
+    built-in default for that field. This matters because an aggregate query
+    without GROUP BY still yields one row when nothing matches, and that row
+    can carry NaN/NULL values; a NaN clamp would turn every later
+    `min(clamp, value)` into NaN.
+
+    Args:
+        client: object exposing `execute(query)` and returning a sequence of
+            row tuples (presumably a clickhouse_driver Client -- confirm
+            against get_client()).
+
+    Returns:
+        dict of {field_name: clamp_value} with the keys 'slippage',
+        'priority_fee', 'total_usd', 'history_bought_cost_sol' and
+        'realized_profit_sol'; every value is a finite positive float.
+    """
+    # Function-scope import keeps this block self-contained regardless of the
+    # module's top-level imports.
+    import math
+
+    def _clamp_or_default(value, floor, default):
+        """Return max(value, floor), or `default` when value is unusable."""
+        try:
+            number = float(value)
+        except (TypeError, ValueError):
+            # None (NULL aggregate) or a non-numeric cell.
+            return default
+        if not math.isfinite(number):
+            # NaN/inf would silently propagate through min()/max() downstream.
+            return default
+        return max(number, floor)
+
+    print("   -> Computing P99 clamp values from trades table...")
+    trade_query = """
+    SELECT
+        quantile(0.99)(abs(slippage)) AS p99_slippage,
+        quantile(0.99)(priority_fee) AS p99_priority_fee,
+        quantile(0.99)(total_usd) AS p99_total_usd
+    FROM trades
+    WHERE success = 1
+    """
+    trade_row = client.execute(trade_query)
+
+    print("   -> Computing P99 clamp values from wallet_holdings table...")
+    holdings_query = """
+    SELECT
+        quantile(0.99)(history_bought_cost_sol) AS p99_bought_cost_sol,
+        quantile(0.99)(abs(realized_profit_sol)) AS p99_realized_profit_sol
+    FROM wallet_holdings
+    """
+    holdings_row = client.execute(holdings_query)
+
+    clamps = {
+        # Defaults, kept whenever a query returns no row or an unusable value
+        'slippage': 1.0,
+        'priority_fee': 0.1,
+        'total_usd': 100000.0,
+        'history_bought_cost_sol': 30.0,
+        'realized_profit_sol': 150.0,
+    }
+
+    if trade_row and trade_row[0]:
+        r = trade_row[0]
+        clamps['slippage'] = _clamp_or_default(r[0], 0.01, clamps['slippage'])
+        clamps['priority_fee'] = _clamp_or_default(r[1], 1e-9, clamps['priority_fee'])
+        clamps['total_usd'] = _clamp_or_default(r[2], 1.0, clamps['total_usd'])
+
+    if holdings_row and holdings_row[0]:
+        r = holdings_row[0]
+        clamps['history_bought_cost_sol'] = _clamp_or_default(
+            r[0], 0.01, clamps['history_bought_cost_sol'])
+        clamps['realized_profit_sol'] = _clamp_or_default(
+            r[1], 0.01, clamps['realized_profit_sol'])
+
+    print(f"   -> P99 Clamps: {clamps}")
+    return clamps
 def fetch_all_metrics(client):
     """
     Fetches all needed metrics for all tokens in a single query.

scripts/cache_dataset.py CHANGED Viewed

@@ -22,7 +22,7 @@ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from scripts.analyze_distribution import get_return_class_map
 from scripts.compute_quality_score import get_token_quality_scores, fetch_token_metrics, _bucket_id, _midrank_percentiles, EPS
 from clickhouse_driver import Client as ClickHouseClient
@@ -65,7 +65,8 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
         horizons_seconds=dataset_config['horizons_seconds'],
         quantiles=dataset_config['quantiles'],
         min_trade_usd=dataset_config['min_trade_usd'],
-        max_seq_len=dataset_config['max_seq_len']
     )
     _worker_dataset.sampled_mints = dataset_config['sampled_mints']
     _worker_return_class_map = return_class_map
@@ -179,11 +180,14 @@ def main():
         return_class_map, _ = get_return_class_map(clickhouse_client)
         print(f"INFO: Loaded {len(return_class_map)} classified tokens.")
         print("INFO: Fetching Quality Scores...")
         quality_scores_map = get_token_quality_scores(clickhouse_client)
         print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
-        dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, ohlc_stats_path=args.ohlc_stats_path, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length)
         if len(dataset) == 0:
             print("WARNING: No samples. Exiting.")
@@ -219,7 +223,7 @@ def main():
         print(f"INFO: Workers: {args.num_workers}")
         db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
-        dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'ohlc_stats_path': args.ohlc_stats_path, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints}
         # Build tasks with class-aware multi-sampling for balanced cache
         import random

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from scripts.analyze_distribution import get_return_class_map, compute_p99_clamps
 from scripts.compute_quality_score import get_token_quality_scores, fetch_token_metrics, _bucket_id, _midrank_percentiles, EPS
 from clickhouse_driver import Client as ClickHouseClient
         horizons_seconds=dataset_config['horizons_seconds'],
         quantiles=dataset_config['quantiles'],
         min_trade_usd=dataset_config['min_trade_usd'],
+        max_seq_len=dataset_config['max_seq_len'],
+        p99_clamps=dataset_config.get('p99_clamps')
     )
     _worker_dataset.sampled_mints = dataset_config['sampled_mints']
     _worker_return_class_map = return_class_map
         return_class_map, _ = get_return_class_map(clickhouse_client)
         print(f"INFO: Loaded {len(return_class_map)} classified tokens.")
+        print("INFO: Computing P99 clamp values...")
+        p99_clamps = compute_p99_clamps(clickhouse_client)
         print("INFO: Fetching Quality Scores...")
         quality_scores_map = get_token_quality_scores(clickhouse_client)
         print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
+        dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, ohlc_stats_path=args.ohlc_stats_path, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length, p99_clamps=p99_clamps)
         if len(dataset) == 0:
             print("WARNING: No samples. Exiting.")
         print(f"INFO: Workers: {args.num_workers}")
         db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
+        dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'ohlc_stats_path': args.ohlc_stats_path, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints, 'p99_clamps': p99_clamps}
         # Build tasks with class-aware multi-sampling for balanced cache
         import random