zirobtc commited on
Commit
1167296
·
1 Parent(s): 0b5fec0

Upload folder using huggingface_hub

Browse files
Files changed (8) hide show
  1. .gitignore +5 -1
  2. data/data_collator.py +7 -0
  3. data/data_loader.py +21 -18
  4. last_prompt.txt +155 -0
  5. log.log +2 -2
  6. requirements.txt +3 -2
  7. train.py +3 -2
  8. train.sh +1 -1
.gitignore CHANGED
@@ -7,4 +7,8 @@ runs/
7
 
8
  data/pump_fun
9
 
10
- .env
 
 
 
 
 
7
 
8
  data/pump_fun
9
 
10
+ .env
11
+
12
+ data/cache
13
+ .tmp/
14
+ .cache/
data/data_collator.py CHANGED
@@ -399,6 +399,13 @@ class MemecoinCollator:
399
 
400
  # Loop through sequences to populate tensors and collect chart events
401
  for i, seq in enumerate(all_event_sequences):
 
 
 
 
 
 
 
402
  seq_len = len(seq)
403
  if seq_len == 0: continue
404
  attention_mask[i, :seq_len] = 1
 
399
 
400
  # Loop through sequences to populate tensors and collect chart events
401
  for i, seq in enumerate(all_event_sequences):
402
+ # --- LOGGING CONTEXT (First item only) ---
403
+ if i == 0:
404
+ context_names = [e.get('event_type', 'Unknown') for e in seq]
405
+ print("\n[DataCollator] Context Preview (Event Sequence Names):")
406
+ print(context_names)
407
+ print(f"[DataCollator] Sequence Length: {len(context_names)}\n")
408
+
409
  seq_len = len(seq)
410
  if seq_len == 0: continue
411
  attention_mask[i, :seq_len] = 1
data/data_loader.py CHANGED
@@ -693,28 +693,31 @@ class OracleDataset(Dataset):
693
  image_resp.raise_for_status()
694
  image = Image.open(BytesIO(image_resp.content))
695
  except (requests.RequestException, ValueError, IOError) as e:
696
- raise RuntimeError(f"FATAL: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
 
 
697
 
698
 
699
  # --- FIXED: Check for valid metadata before adding to pooler ---
700
  token_name = data.get('name') if data.get('name') and data.get('name').strip() else None
701
  token_symbol = data.get('symbol') if data.get('symbol') and data.get('symbol').strip() else None
702
 
703
- # --- IMAGE IS A FUCKING MUST
704
- # --- FIXED: Correctly handle invalid secondary tokens without aborting the whole process ---
705
- if not token_name or not token_symbol or not image:
706
- if not token_name: reason = "name"
707
- elif not token_symbol: reason = "symbol"
708
- else: reason = "image (fetch failed)"
709
-
710
- print(f"WARN: Token {addr} is missing essential metadata ('{reason}'). This token will be skipped.")
711
-
712
- # If this function was called with only one token, it's the main token.
713
- # If the main token is invalid, the whole sample is invalid, so return None.
714
- if len(token_addresses) == 1:
715
- return None
716
- # Otherwise, it's a secondary token. Skip it and continue with the others.
717
- continue
 
718
 
719
  # --- NEW: Add is_vanity feature based on the token address ---
720
  data['is_vanity'] = addr.lower().endswith("pump")
@@ -788,7 +791,7 @@ class OracleDataset(Dataset):
788
  raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
789
  filepath = self.cached_files[idx]
790
  try:
791
- raw_data = torch.load(filepath, map_location='cpu')
792
  except Exception as e:
793
  print(f"ERROR: Could not load cached item {filepath}: {e}")
794
  return None
@@ -988,7 +991,7 @@ class OracleDataset(Dataset):
988
  max_horizon_seconds=self.max_cache_horizon_seconds,
989
  include_wallet_data=False,
990
  include_graph=False,
991
- min_trades=25
992
  )
993
  if raw_data is None:
994
  return None
 
693
  image_resp.raise_for_status()
694
  image = Image.open(BytesIO(image_resp.content))
695
  except (requests.RequestException, ValueError, IOError) as e:
696
+
697
+ print(f"WARN: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
698
+ image = None
699
 
700
 
701
  # --- FIXED: Check for valid metadata before adding to pooler ---
702
  token_name = data.get('name') if data.get('name') and data.get('name').strip() else None
703
  token_symbol = data.get('symbol') if data.get('symbol') and data.get('symbol').strip() else None
704
 
705
+ # --- RELAXED: Allow missing metadata (pass None -> Zero Embedding) ---
706
+ # The collator's EmbeddingPooler and logic handles non-str/non-image items
707
+ # by skipping encoding and leaving their embedding vector as zeros.
708
+ if not token_name:
709
+ token_name = None #(Zeroed)
710
+ if not token_symbol:
711
+ token_symbol = None #(Zeroed)
712
+
713
+ # If image failed or missing, pass None
714
+ if not image:
715
+ image = None #(Zeroed)
716
+
717
+ # Only skip if we somehow have NO address (should technically fail earlier)
718
+ if not addr:
719
+ print(f"WARN: Token {addr} has no address?? Skipping.")
720
+ continue
721
 
722
  # --- NEW: Add is_vanity feature based on the token address ---
723
  data['is_vanity'] = addr.lower().endswith("pump")
 
791
  raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
792
  filepath = self.cached_files[idx]
793
  try:
794
+ raw_data = torch.load(filepath, map_location='cpu', weights_only=False)
795
  except Exception as e:
796
  print(f"ERROR: Could not load cached item {filepath}: {e}")
797
  return None
 
991
  max_horizon_seconds=self.max_cache_horizon_seconds,
992
  include_wallet_data=False,
993
  include_graph=False,
994
+ min_trades=50
995
  )
996
  if raw_data is None:
997
  return None
last_prompt.txt ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ can you log the model context?
2
+ but not the full data — just an array of the event names (not unique names). So rather than dumping the full event data, dump only the name, so I can see what the context looks like.
3
+
4
+
5
+ can you log the model context? But not the full data — just an array of the event names (not unique names). So rather than dumping the full event data, dump only the name, so I can see what the context looks like.
6
+
7
+
8
+ I have analyzed scripts/cache_dataset.py, train.py, and the underlying data modules.
9
+
10
+ Here is the breakdown of the data sources and their cachability.
11
+
12
+ 1. ClickHouse Tables & Neo4j Data
13
+ These scripts utilize the following data sources:
14
+
15
+ ClickHouse Tables:
16
+
17
+ mints: Token creation and initial metadata.
18
+ trades: Buy/Sell transactions.
19
+ transfers: Token transfers between wallets.
20
+ tokens: Token metadata (name, symbol, uri).
21
+ token_metrics: Historical token stats (e.g., ATH price).
22
+ pool_creations: Liquidity pool initialization.
23
+ liquidity: Liquidity addition/removal events.
24
+ fee_collections: Protocol fee events.
25
+ burns: Token burn events.
26
+ supply_locks: Supply locking events.
27
+ migrations: Pump.fun to Raydium migration events.
28
+ wallet_profiles: Static/Base wallet information.
29
+ wallet_profile_metrics: Time-series metrics for wallets (PnL, winrate, etc.).
30
+ wallet_socials: Social media handles for wallets.
31
+ wallet_holdings: Token balances for wallets.
32
+ Neo4j Graph:
33
+
34
+ Nodes: Wallets and Tokens.
35
+ Relationships: Transactional links (e.g., TRANSFERRED, TRADED) with properties like timestamp. The code fetches links up to 1 degree of separation.
36
+ 2. Events & Cachability Analysis
37
+ We can categorize the data into Immutable Events (Fully Cachable) and State-Dependent Data (Requires Point-in-Time Calculation).
38
+
39
+ ✅ Fully Cachable (Immutable Events)
40
+ These are append-only logs. Once an event happens, it doesn't change. You can cache these as simple chronological sequences.
41
+
42
+ Trades: Simple buy/sell executions (trades table).
43
+ Transfers: Movement of tokens (transfers table).
44
+ Pool Creations: Setup of new trading pools (pool_creations table).
45
+ Liquidity Changes: Adds/removes from pools (liquidity table).
46
+ Fee Collections: Protocol revenue events (fee_collections table).
47
+ Burns: Supply reduction events (burns table).
48
+ Supply Locks: Freezing supply (supply_locks table).
49
+ Migrations: Liquidity migration events (migrations table).
50
+ Minting: The initial token generation (mints table).
51
+ ⚠️ Not Fully Cachable (State-Dependent)
52
+ These represent the state of the system at a specific point in time. Because they change over time (mutable), they cannot be simply cached as a static list without a reference timestamp (T_cutoff).
53
+
54
+ Holder Snapshots:
55
+ Dependency: wallet_holdings.
56
+ Why: A snapshot requires calculating who held the token and how much at a specific second. The code runs a windowed query to find the latest balance WHERE updated_at <= T_cutoff. A simpler event stream isn't enough; you need the aggregate state.
57
+ Wallet Profiles & Metrics:
58
+ Dependency: wallet_profiles, wallet_profile_metrics.
59
+ Why: A wallet's PnL, winrate, and "Smart Trader" status change over time. The model fetches the latest metrics record available before the cutoff time.
60
+ Wallet Holdings:
61
+ Dependency: wallet_holdings.
62
+ Why: Used to determine what other tokens a wallet is holding at the time of the trade.
63
+ Graph State:
64
+ Dependency: Neo4j.
65
+ Why: While links are technically events, the code constructs a local graph centered around active wallets at T_cutoff. The structure of the graph is state-dependent based on the window.
66
+ no, I asked you to tell me which tables are queried during caching and which are queried on the fly
67
+
68
+ Thought for 9s
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+ Analyze Data Dependencies
78
+ I have completed the code tracing. I confirmed that
79
+ cache_dataset.py
80
+ fetches only immutable event streams (Trades, Transfers, etc.) while
81
+ train.py
82
+ fetches all state-dependent data (Profiles, Holdings, Graph, Snapshots) at the specific T_cutoff for each sample.
83
+
84
+ Progress Updates
85
+ 1
86
+ Checking data loader implementation for cache vs train logic
87
+
88
+ 2
89
+ Reporting final analysis to user
90
+
91
+ Yes, scripts/cache_dataset.py and train.py perform very different queries.
92
+
93
+ The caching script focuses on saving the heavy, immutable event logs, while the training loop queries the "state of the world" (Holdings, Profiles, Graph) at the specific random moment (T_cutoff) chosen for each sample.
94
+
95
+ 1. Cached Data (Queried by scripts/cache_dataset.py)
96
+ These tables are queried once and saved to disk. They contain immutable events—things that happen and never change.
97
+
98
+ trades: The bulk of the data. All buy/sell events.
99
+ transfers: Token movement events.
100
+ pool_creations: When pools were made.
101
+ liquidity: Adds/removes from pools.
102
+ fee_collections: Protocol fees.
103
+ burns: Token burns.
104
+ supply_locks: Supply freezing.
105
+ migrations: Pump.fun $\to$ Raydium migrations.
106
+ mints: The initial token entries.
107
+ Note: The script explicitly sets include_wallet_data=False and include_graph=False when calling fetch_raw_token_data.
108
+
109
+ 2. On-the-Fly Data (Queried by train.py)
110
+ These are queried every time a sample is loaded in the dataloader. They represent the mutable state of the blockchain at the specific T_cutoff used for training.
111
+
112
+ wallet_holdings (Heavy Query):
113
+ Used to find the top holders at that exact second.
114
+ Used to see what other tokens the traders were holding.
115
+ Query: Windowed search for current_balance > 0 and updated_at <= T_cutoff.
116
+ wallet_profiles & wallet_profile_metrics:
117
+ Used to get the PnL, winrate, and "Smart Trader" status of wallets involved in the trade.
118
+ Query: Fetches the latest profile update before T_cutoff.
119
+ wallet_socials:
120
+ Used to check if a wallet is a known KOL (Key Opinion Leader) or has Twitter/Telegram linked.
121
+ tokens & token_metrics:
122
+ Used to get the token name, symbol, URI, and ATH price as they were at T_cutoff.
123
+ Query: Fetches deployed token stats for the wallets.
124
+ Neo4j Graph (TRANSFERRED, etc.):
125
+ Used to build the 1-degree graph of connections between the active wallets.
126
+ Query: "Who did these wallets interact with before T_cutoff?"
127
+ Snapshot Metrics:
128
+ The dataloader runs a loop to generate OnChain_Snapshot events.
129
+ Queries per snapshot interval: fetch_total_holders_count_for_token and fetch_token_holders_for_snapshot.
130
+ Summary Table
131
+ Data Source Accessed By Why?
132
+ Event Streams (Trades, Transfers, Burns...) Cache Script Information is static (append-only). Safe to save.
133
+ Wallet Profiles (PnL, Winrate) Train (On-fly) A wallet's PnL changes every day. We need the value at T_cutoff.
134
+ Holdings (Balances) Train (On-fly) Balances change constantly. We need the snapshot at T_cutoff.
135
+ Graph (Neo4j) Train (On-fly) The network of connections evolves over time.
136
+ Token Metadata Train (On-fly) To handle potential metadata updates (rare, but accurate).
137
+ Recent actions
138
+ Review Changes
139
+
140
+
141
+
142
+
143
+
144
+ Gemini 3 Pro (High)
145
+
146
+ Model
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49fbbad7f8b64bdc8c2c12b853433e6ec1e67615288a42112086254d96e0acf5
3
- size 3985
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb6fc43f8ae6467768fb090cfdda9ef48e68d361874317db93e5eee126539989
3
+ size 143685
requirements.txt CHANGED
@@ -7,9 +7,10 @@ datasets
7
  transformers
8
  huggingface_hub
9
  decord
10
- #pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
11
  clickhouse-driver
12
  neo4j
13
  tensorboard
14
  accelerate
15
- python-dotenv
 
 
 
7
  transformers
8
  huggingface_hub
9
  decord
 
10
  clickhouse-driver
11
  neo4j
12
  tensorboard
13
  accelerate
14
+ python-dotenv
15
+ torch_geometric
16
+ sentencepiece
train.py CHANGED
@@ -231,7 +231,7 @@ def main() -> None:
231
  port=int(args.clickhouse_port)
232
  )
233
 
234
- neo4j_auth = None
235
  if args.neo4j_user is not None:
236
  neo4j_auth = (args.neo4j_user, args.neo4j_password or "")
237
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=neo4j_auth)
@@ -244,7 +244,8 @@ def main() -> None:
244
  quantiles=quantiles,
245
  max_samples=args.max_samples,
246
  ohlc_stats_path=args.ohlc_stats_path,
247
- t_cutoff_seconds=int(args.t_cutoff_seconds)
 
248
  )
249
 
250
  if len(dataset) == 0:
 
231
  port=int(args.clickhouse_port)
232
  )
233
 
234
+ neo4j_auth = ("neo4j", "neo4j123")
235
  if args.neo4j_user is not None:
236
  neo4j_auth = (args.neo4j_user, args.neo4j_password or "")
237
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=neo4j_auth)
 
244
  quantiles=quantiles,
245
  max_samples=args.max_samples,
246
  ohlc_stats_path=args.ohlc_stats_path,
247
+ t_cutoff_seconds=int(args.t_cutoff_seconds),
248
+ cache_dir="/workspace/apollo/data/cache"
249
  )
250
 
251
  if len(dataset) == 0:
train.sh CHANGED
@@ -1,5 +1,5 @@
1
  accelerate launch train.py \
2
- --epochs 1 \
3
  --batch_size 1 \
4
  --learning_rate 1e-4 \
5
  --warmup_ratio 0.1 \
 
1
  accelerate launch train.py \
2
+ --epochs 10 \
3
  --batch_size 1 \
4
  --learning_rate 1e-4 \
5
  --warmup_ratio 0.1 \