zirobtc commited on
Commit
1167296
·
1 Parent(s): 0b5fec0

Upload folder using huggingface_hub

Browse files
Files changed (8) hide show
  1. .gitignore +5 -1
  2. data/data_collator.py +7 -0
  3. data/data_loader.py +21 -18
  4. last_prompt.txt +155 -0
  5. log.log +2 -2
  6. requirements.txt +3 -2
  7. train.py +3 -2
  8. train.sh +1 -1
.gitignore CHANGED
@@ -7,4 +7,8 @@ runs/
7
 
8
  data/pump_fun
9
 
10
- .env
 
 
 
 
 
7
 
8
  data/pump_fun
9
 
10
+ .env
11
+
12
+ data/cache
13
+ .tmp/
14
+ .cache/
data/data_collator.py CHANGED
@@ -399,6 +399,13 @@ class MemecoinCollator:
399
 
400
  # Loop through sequences to populate tensors and collect chart events
401
  for i, seq in enumerate(all_event_sequences):
 
 
 
 
 
 
 
402
  seq_len = len(seq)
403
  if seq_len == 0: continue
404
  attention_mask[i, :seq_len] = 1
 
399
 
400
  # Loop through sequences to populate tensors and collect chart events
401
  for i, seq in enumerate(all_event_sequences):
402
+ # --- LOGGING CONTEXT (First item only) ---
403
+ if i == 0:
404
+ context_names = [e.get('event_type', 'Unknown') for e in seq]
405
+ print("\n[DataCollator] Context Preview (Event Sequence Names):")
406
+ print(context_names)
407
+ print(f"[DataCollator] Sequence Length: {len(context_names)}\n")
408
+
409
  seq_len = len(seq)
410
  if seq_len == 0: continue
411
  attention_mask[i, :seq_len] = 1
data/data_loader.py CHANGED
@@ -693,28 +693,31 @@ class OracleDataset(Dataset):
693
  image_resp.raise_for_status()
694
  image = Image.open(BytesIO(image_resp.content))
695
  except (requests.RequestException, ValueError, IOError) as e:
696
- raise RuntimeError(f"FATAL: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
 
 
697
 
698
 
699
  # --- FIXED: Check for valid metadata before adding to pooler ---
700
  token_name = data.get('name') if data.get('name') and data.get('name').strip() else None
701
  token_symbol = data.get('symbol') if data.get('symbol') and data.get('symbol').strip() else None
702
 
703
- # --- IMAGE IS A FUCKING MUST
704
- # --- FIXED: Correctly handle invalid secondary tokens without aborting the whole process ---
705
- if not token_name or not token_symbol or not image:
706
- if not token_name: reason = "name"
707
- elif not token_symbol: reason = "symbol"
708
- else: reason = "image (fetch failed)"
709
-
710
- print(f"WARN: Token {addr} is missing essential metadata ('{reason}'). This token will be skipped.")
711
-
712
- # If this function was called with only one token, it's the main token.
713
- # If the main token is invalid, the whole sample is invalid, so return None.
714
- if len(token_addresses) == 1:
715
- return None
716
- # Otherwise, it's a secondary token. Skip it and continue with the others.
717
- continue
 
718
 
719
  # --- NEW: Add is_vanity feature based on the token address ---
720
  data['is_vanity'] = addr.lower().endswith("pump")
@@ -788,7 +791,7 @@ class OracleDataset(Dataset):
788
  raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
789
  filepath = self.cached_files[idx]
790
  try:
791
- raw_data = torch.load(filepath, map_location='cpu')
792
  except Exception as e:
793
  print(f"ERROR: Could not load cached item {filepath}: {e}")
794
  return None
@@ -988,7 +991,7 @@ class OracleDataset(Dataset):
988
  max_horizon_seconds=self.max_cache_horizon_seconds,
989
  include_wallet_data=False,
990
  include_graph=False,
991
- min_trades=25
992
  )
993
  if raw_data is None:
994
  return None
 
693
  image_resp.raise_for_status()
694
  image = Image.open(BytesIO(image_resp.content))
695
  except (requests.RequestException, ValueError, IOError) as e:
696
+
697
+ print(f"WARN: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
698
+ image = None
699
 
700
 
701
  # --- FIXED: Check for valid metadata before adding to pooler ---
702
  token_name = data.get('name') if data.get('name') and data.get('name').strip() else None
703
  token_symbol = data.get('symbol') if data.get('symbol') and data.get('symbol').strip() else None
704
 
705
+ # --- RELAXED: Allow missing metadata (pass None -> Zero Embedding) ---
706
+ # The collator's EmbeddingPooler and logic handles non-str/non-image items
707
+ # by skipping encoding and leaving their embedding vector as zeros.
708
+ if not token_name:
709
+ token_name = None #(Zeroed)
710
+ if not token_symbol:
711
+ token_symbol = None #(Zeroed)
712
+
713
+ # If image failed or missing, pass None
714
+ if not image:
715
+ image = None #(Zeroed)
716
+
717
+ # Only skip if we somehow have NO address (should technically fail earlier)
718
+ if not addr:
719
+ print(f"WARN: Token {addr} has no address?? Skipping.")
720
+ continue
721
 
722
  # --- NEW: Add is_vanity feature based on the token address ---
723
  data['is_vanity'] = addr.lower().endswith("pump")
 
791
  raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
792
  filepath = self.cached_files[idx]
793
  try:
794
+ raw_data = torch.load(filepath, map_location='cpu', weights_only=False)
795
  except Exception as e:
796
  print(f"ERROR: Could not load cached item {filepath}: {e}")
797
  return None
 
991
  max_horizon_seconds=self.max_cache_horizon_seconds,
992
  include_wallet_data=False,
993
  include_graph=False,
994
+ min_trades=50
995
  )
996
  if raw_data is None:
997
  return None
last_prompt.txt ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ can you log the model context?
2
+ but not the full data — just an array of the event names (not unique names). So rather than dumping the full event data, dump only the name, so I can see what the context looks like.
3
+
4
+
5
+ can you log the model context? But not the full data — just an array of the event names (not unique names). So rather than dumping the full event data, dump only the name, so I can see what the context looks like.
6
+
7
+
8
+ I have analyzed scripts/cache_dataset.py, train.py, and the underlying data modules.
9
+
10
+ Here is the breakdown of the data sources and their cachability.
11
+
12
+ 1. ClickHouse Tables & Neo4j Data
13
+ These scripts utilize the following data sources:
14
+
15
+ ClickHouse Tables:
16
+
17
+ mints: Token creation and initial metadata.
18
+ trades: Buy/Sell transactions.
19
+ transfers: Token transfers between wallets.
20
+ tokens: Token metadata (name, symbol, uri).
21
+ token_metrics: Historical token stats (e.g., ATH price).
22
+ pool_creations: Liquidity pool initialization.
23
+ liquidity: Liquidity addition/removal events.
24
+ fee_collections: Protocol fee events.
25
+ burns: Token burn events.
26
+ supply_locks: Supply locking events.
27
+ migrations: Pump.fun to Raydium migration events.
28
+ wallet_profiles: Static/Base wallet information.
29
+ wallet_profile_metrics: Time-series metrics for wallets (PnL, winrate, etc.).
30
+ wallet_socials: Social media handles for wallets.
31
+ wallet_holdings: Token balances for wallets.
32
+ Neo4j Graph:
33
+
34
+ Nodes: Wallets and Tokens.
35
+ Relationships: Transactional links (e.g., TRANSFERRED, TRADED) with properties like timestamp. The code fetches links up to 1 degree of separation.
36
+ 2. Events & Cachability Analysis
37
+ We can categorize the data into Immutable Events (Fully Cachable) and State-Dependent Data (Requires Point-in-Time Calculation).
38
+
39
+ ✅ Fully Cachable (Immutable Events)
40
+ These are append-only logs. Once an event happens, it doesn't change. You can cache these as simple chronological sequences.
41
+
42
+ Trades: Simple buy/sell executions (trades table).
43
+ Transfers: Movement of tokens (transfers table).
44
+ Pool Creations: Setup of new trading pools (pool_creations table).
45
+ Liquidity Changes: Adds/removes from pools (liquidity table).
46
+ Fee Collections: Protocol revenue events (fee_collections table).
47
+ Burns: Supply reduction events (burns table).
48
+ Supply Locks: Freezing supply (supply_locks table).
49
+ Migrations: Liquidity migration events (migrations table).
50
+ Minting: The initial token generation (mints table).
51
+ ⚠️ Not Fully Cachable (State-Dependent)
52
+ These represent the state of the system at a specific point in time. Because they change over time (mutable), they cannot be simply cached as a static list without a reference timestamp (T_cutoff).
53
+
54
+ Holder Snapshots:
55
+ Dependency: wallet_holdings.
56
+ Why: A snapshot requires calculating who held the token and how much at a specific second. The code runs a windowed query to find the latest balance WHERE updated_at <= T_cutoff. A simpler event stream isn't enough; you need the aggregate state.
57
+ Wallet Profiles & Metrics:
58
+ Dependency: wallet_profiles, wallet_profile_metrics.
59
+ Why: A wallet's PnL, winrate, and "Smart Trader" status change over time. The model fetches the latest metrics record available before the cutoff time.
60
+ Wallet Holdings:
61
+ Dependency: wallet_holdings.
62
+ Why: Used to determine what other tokens a wallet is holding at the time of the trade.
63
+ Graph State:
64
+ Dependency: Neo4j.
65
+ Why: While links are technically events, the code constructs a local graph centered around active wallets at T_cutoff. The structure of the graph is state-dependent based on the window.
66
+ no, I asked you to tell me which tables are queried during caching and which are queried on the fly
67
+
68
+ Thought for 9s
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+ Analyze Data Dependencies
78
+ I have completed the code tracing. I confirmed that
79
+ cache_dataset.py
80
+ fetches only immutable event streams (Trades, Transfers, etc.) while
81
+ train.py
82
+ fetches all state-dependent data (Profiles, Holdings, Graph, Snapshots) at the specific T_cutoff for each sample.
83
+
84
+ Progress Updates
85
+ 1
86
+ Checking data loader implementation for cache vs train logic
87
+
88
+ 2
89
+ Reporting final analysis to user
90
+
91
+ Yes, scripts/cache_dataset.py and train.py perform very different queries.
92
+
93
+ The caching script focuses on saving the heavy, immutable event logs, while the training loop queries the "state of the world" (Holdings, Profiles, Graph) at the specific random moment (T_cutoff) chosen for each sample.
94
+
95
+ 1. Cached Data (Queried by scripts/cache_dataset.py)
96
+ These tables are queried once and saved to disk. They contain immutable events—things that happen and never change.
97
+
98
+ trades: The bulk of the data. All buy/sell events.
99
+ transfers: Token movement events.
100
+ pool_creations: When pools were made.
101
+ liquidity: Adds/removes from pools.
102
+ fee_collections: Protocol fees.
103
+ burns: Token burns.
104
+ supply_locks: Supply freezing.
105
+ migrations: Pump.fun $\to$ Raydium migrations.
106
+ mints: The initial token entries.
107
+ Note: The script explicitly sets include_wallet_data=False and include_graph=False when calling fetch_raw_token_data.
108
+
109
+ 2. On-the-Fly Data (Queried by train.py)
110
+ These are queried every time a sample is loaded in the dataloader. They represent the mutable state of the blockchain at the specific T_cutoff used for training.
111
+
112
+ wallet_holdings (Heavy Query):
113
+ Used to find the top holders at that exact second.
114
+ Used to see what other tokens the traders were holding.
115
+ Query: Windowed search for current_balance > 0 and updated_at <= T_cutoff.
116
+ wallet_profiles & wallet_profile_metrics:
117
+ Used to get the PnL, winrate, and "Smart Trader" status of wallets involved in the trade.
118
+ Query: Fetches the latest profile update before T_cutoff.
119
+ wallet_socials:
120
+ Used to check if a wallet is a known KOL (Key Opinion Leader) or has Twitter/Telegram linked.
121
+ tokens & token_metrics:
122
+ Used to get the token name, symbol, URI, and ATH price as they were at T_cutoff.
123
+ Query: Fetches deployed token stats for the wallets.
124
+ Neo4j Graph (TRANSFERRED, etc.):
125
+ Used to build the 1-degree graph of connections between the active wallets.
126
+ Query: "Who did these wallets interact with before T_cutoff?"
127
+ Snapshot Metrics:
128
+ The dataloader runs a loop to generate OnChain_Snapshot events.
129
+ Queries per snapshot interval: fetch_total_holders_count_for_token and fetch_token_holders_for_snapshot.
130
+ Summary Table
131
+ Data Source Accessed By Why?
132
+ Event Streams (Trades, Transfers, Burns...) Cache Script Information is static (append-only). Safe to save.
133
+ Wallet Profiles (PnL, Winrate) Train (On-fly) A wallet's PnL changes every day. We need the value at T_cutoff.
134
+ Holdings (Balances) Train (On-fly) Balances change constantly. We need the snapshot at T_cutoff.
135
+ Graph (Neo4j) Train (On-fly) The network of connections evolves over time.
136
+ Token Metadata Train (On-fly) To handle potential metadata updates (rare, but accurate).
137
+ Recent actions
138
+ Review Changes
139
+
140
+
141
+
142
+
143
+
144
+ Gemini 3 Pro (High)
145
+
146
+ Model
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49fbbad7f8b64bdc8c2c12b853433e6ec1e67615288a42112086254d96e0acf5
3
- size 3985
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb6fc43f8ae6467768fb090cfdda9ef48e68d361874317db93e5eee126539989
3
+ size 143685
requirements.txt CHANGED
@@ -7,9 +7,10 @@ datasets
7
  transformers
8
  huggingface_hub
9
  decord
10
- #pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu126
11
  clickhouse-driver
12
  neo4j
13
  tensorboard
14
  accelerate
15
- python-dotenv
 
 
 
7
  transformers
8
  huggingface_hub
9
  decord
 
10
  clickhouse-driver
11
  neo4j
12
  tensorboard
13
  accelerate
14
+ python-dotenv
15
+ torch_geometric
16
+ sentencepiece
train.py CHANGED
@@ -231,7 +231,7 @@ def main() -> None:
231
  port=int(args.clickhouse_port)
232
  )
233
 
234
- neo4j_auth = None
235
  if args.neo4j_user is not None:
236
  neo4j_auth = (args.neo4j_user, args.neo4j_password or "")
237
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=neo4j_auth)
@@ -244,7 +244,8 @@ def main() -> None:
244
  quantiles=quantiles,
245
  max_samples=args.max_samples,
246
  ohlc_stats_path=args.ohlc_stats_path,
247
- t_cutoff_seconds=int(args.t_cutoff_seconds)
 
248
  )
249
 
250
  if len(dataset) == 0:
 
231
  port=int(args.clickhouse_port)
232
  )
233
 
234
+ neo4j_auth = ("neo4j", "neo4j123")
235
  if args.neo4j_user is not None:
236
  neo4j_auth = (args.neo4j_user, args.neo4j_password or "")
237
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=neo4j_auth)
 
244
  quantiles=quantiles,
245
  max_samples=args.max_samples,
246
  ohlc_stats_path=args.ohlc_stats_path,
247
+ t_cutoff_seconds=int(args.t_cutoff_seconds),
248
+ cache_dir="/workspace/apollo/data/cache"
249
  )
250
 
251
  if len(dataset) == 0:
train.sh CHANGED
@@ -1,5 +1,5 @@
1
  accelerate launch train.py \
2
- --epochs 1 \
3
  --batch_size 1 \
4
  --learning_rate 1e-4 \
5
  --warmup_ratio 0.1 \
 
1
  accelerate launch train.py \
2
+ --epochs 10 \
3
  --batch_size 1 \
4
  --learning_rate 1e-4 \
5
  --warmup_ratio 0.1 \