Spaces:
Sleeping
Sleeping
Upload streaming_loader.py with huggingface_hub
Browse files — streaming_loader.py (+9 −4)
streaming_loader.py
CHANGED
|
@@ -19,7 +19,8 @@ class StreamingDataLoader:
|
|
| 19 |
model_type: str = "deeplob",
|
| 20 |
batch_size: int = 32,
|
| 21 |
chunk_size: int = 500, # Reduced to ensure frequent yields
|
| 22 |
-
buffer_size: int = 200
|
|
|
|
| 23 |
"""
|
| 24 |
Args:
|
| 25 |
repo_id: HF Dataset ID
|
|
@@ -27,12 +28,14 @@ class StreamingDataLoader:
|
|
| 27 |
batch_size: Training batch size
|
| 28 |
chunk_size: Rows per processing chunk
|
| 29 |
buffer_size: Overlap size to maintain rolling stats continuity
|
|
|
|
| 30 |
"""
|
| 31 |
self.repo_id = repo_id
|
| 32 |
self.model_type = model_type
|
| 33 |
self.batch_size = batch_size
|
| 34 |
self.chunk_size = chunk_size
|
| 35 |
self.buffer_size = buffer_size
|
|
|
|
| 36 |
|
| 37 |
self.processor = AlphaDataProcessor()
|
| 38 |
|
|
@@ -40,7 +43,7 @@ class StreamingDataLoader:
|
|
| 40 |
"""
|
| 41 |
Yields batches of (X, y) tensors from the stream.
|
| 42 |
"""
|
| 43 |
-
print(f"📡 Connecting to HF Dataset Stream: {self.repo_id}")
|
| 44 |
token = os.environ.get("HF_TOKEN")
|
| 45 |
|
| 46 |
try:
|
|
@@ -70,16 +73,18 @@ class StreamingDataLoader:
|
|
| 70 |
f for f in files
|
| 71 |
if (f.startswith("data/bar/") or f.startswith("data/candles/"))
|
| 72 |
and f.endswith(".parquet")
|
|
|
|
| 73 |
]
|
| 74 |
-
print(f"📊 Found {len(target_files)} Bar/Candle files for LSTM.")
|
| 75 |
else:
|
| 76 |
# Use L2 Snapshots for DeepLOB/TRM (Support both v1 'order_book_snapshot' and v2 'l2book')
|
| 77 |
target_files = [
|
| 78 |
f for f in files
|
| 79 |
if ("order_book_snapshot" in f or "l2book" in f)
|
| 80 |
and f.endswith(".parquet")
|
|
|
|
| 81 |
]
|
| 82 |
-
print(f"📊 Found {len(target_files)} Snapshot/L2Book files for {self.model_type}.")
|
| 83 |
|
| 84 |
if not target_files:
|
| 85 |
raise RuntimeError(f"No valid training files found for {self.model_type} in {self.repo_id}")
|
|
|
|
| 19 |
model_type: str = "deeplob",
|
| 20 |
batch_size: int = 32,
|
| 21 |
chunk_size: int = 500, # Reduced to ensure frequent yields
|
| 22 |
+
buffer_size: int = 200, # Reduced buffer
|
| 23 |
+
coin: str = "ETH"): # Filter by Symbol (CRITICAL FIX)
|
| 24 |
"""
|
| 25 |
Args:
|
| 26 |
repo_id: HF Dataset ID
|
|
|
|
| 28 |
batch_size: Training batch size
|
| 29 |
chunk_size: Rows per processing chunk
|
| 30 |
buffer_size: Overlap size to maintain rolling stats continuity
|
| 31 |
+
coin: Symbol to filter (e.g. "ETH", "BTC")
|
| 32 |
"""
|
| 33 |
self.repo_id = repo_id
|
| 34 |
self.model_type = model_type
|
| 35 |
self.batch_size = batch_size
|
| 36 |
self.chunk_size = chunk_size
|
| 37 |
self.buffer_size = buffer_size
|
| 38 |
+
self.coin = coin
|
| 39 |
|
| 40 |
self.processor = AlphaDataProcessor()
|
| 41 |
|
|
|
|
| 43 |
"""
|
| 44 |
Yields batches of (X, y) tensors from the stream.
|
| 45 |
"""
|
| 46 |
+
print(f"📡 Connecting to HF Dataset Stream: {self.repo_id} (Filter: {self.coin})")
|
| 47 |
token = os.environ.get("HF_TOKEN")
|
| 48 |
|
| 49 |
try:
|
|
|
|
| 73 |
f for f in files
|
| 74 |
if (f.startswith("data/bar/") or f.startswith("data/candles/"))
|
| 75 |
and f.endswith(".parquet")
|
| 76 |
+
and self.coin in f # STRICT FILTER
|
| 77 |
]
|
| 78 |
+
print(f"📊 Found {len(target_files)} Bar/Candle files for LSTM (Symbol: {self.coin}).")
|
| 79 |
else:
|
| 80 |
# Use L2 Snapshots for DeepLOB/TRM (Support both v1 'order_book_snapshot' and v2 'l2book')
|
| 81 |
target_files = [
|
| 82 |
f for f in files
|
| 83 |
if ("order_book_snapshot" in f or "l2book" in f)
|
| 84 |
and f.endswith(".parquet")
|
| 85 |
+
and self.coin in f # STRICT FILTER
|
| 86 |
]
|
| 87 |
+
print(f"📊 Found {len(target_files)} Snapshot/L2Book files for {self.model_type} (Symbol: {self.coin}).")
|
| 88 |
|
| 89 |
if not target_files:
|
| 90 |
raise RuntimeError(f"No valid training files found for {self.model_type} in {self.repo_id}")
|