Update supernova/data.py

supernova/data.py CHANGED (+31 -18)
@@ -8,7 +8,6 @@ from datasets import load_dataset
 from transformers import PreTrainedTokenizerBase
 import yaml
 
-
 @dataclass
 class DataSource:
     name: str
@@ -19,7 +18,6 @@ class DataSource:
     weight: int = 1
     streaming: bool = True
 
-
 def load_sources_from_yaml(path: str) -> List[DataSource]:
     with open(path, "r", encoding="utf-8") as f:
         cfg = yaml.safe_load(f)
@@ -37,7 +35,6 @@ def load_sources_from_yaml(path: str) -> List[DataSource]:
     assert len(srcs) > 0, "No data sources configured"
     return srcs
 
-
 def build_streams(sources: List[DataSource]) -> List[Iterator[Dict]]:
     iters = []
     for s in sources:
@@ -45,7 +42,6 @@ def build_streams(sources: List[DataSource]) -> List[Iterator[Dict]]:
         iters.append(iter(ds))
     return iters
 
-
 def weighted_choice(weights: List[int]) -> int:
     total = sum(weights)
     r = random.randint(1, total)
@@ -56,7 +52,6 @@ def weighted_choice(weights: List[int]) -> int:
         return i
     return len(weights) - 1
 
-
 class TokenChunkDataset(IterableDataset):
     def __init__(
         self,
@@ -76,22 +71,35 @@ class TokenChunkDataset(IterableDataset):
         iters = build_streams(self.sources)
         while True:
             i = weighted_choice(self.weights)
-
-
-
-
-
-
-
-
-
+            try:
+                row = next(iters[i])
+            except StopIteration:
+                try:
+                    ds = load_dataset(
+                        self.sources[i].hf_path,
+                        self.sources[i].hf_name,
+                        split=self.sources[i].split,
+                        streaming=self.sources[i].streaming
+                    )
+                    iters[i] = iter(ds)
+                    row = next(iters[i])
+                except (StopIteration, Exception) as e:
+                    print(f"Warning: Could not restart iterator for source {self.sources[i].name}: {e}")
+                    continue  # Skip this iteration and try next source
             text = row.get(self.sources[i].text_field, None)
             if isinstance(text, str) and len(text) > 0:
                 yield text
 
+    def _safe_encode(self, text: str) -> list:
+        try:
+            return self.tok.encode(text)
+        except Exception as e:
+            print(f"Encoding error for text: {text[:50]}... Error: {e}")
+            return []
+
     def _iter_token_ids(self) -> Iterator[int]:
         for text in self._iter_texts():
-            ids = self.tok.encode(text)
+            ids = self._safe_encode(text)
            if self.eos_id is not None:
                 ids.append(self.eos_id)
             for t in ids:
@@ -102,7 +110,12 @@ def _safe_encode(self, text: str) -> list:
         for tok_id in self._iter_token_ids():
             buf.append(tok_id)
             while len(buf) >= self.seq_len + 1:
-                x = torch.tensor(buf[:self.seq_len])
-                y = torch.tensor(buf[1:self.seq_len + 1])
-                del buf[:self.seq_len]
+                x = torch.tensor(buf[:self.seq_len], dtype=torch.long)
+                y = torch.tensor(buf[1:self.seq_len + 1], dtype=torch.long)
+                del buf[:self.seq_len]
                 yield x, y
+
+    def __len__(self):
+        # Provide approximate length for progress tracking
+        return 1000000  # Large number for streaming datasets
+
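For reference, the sampling helper left untouched by this commit, weighted_choice, returns an index with probability proportional to its weight, so a source configured with weight: 3 is drawn about three times as often as one with weight: 1. A quick empirical check of that behavior, assuming the function is importable from supernova.data:

    import random
    from collections import Counter

    from supernova.data import weighted_choice

    # Sample many times from weights [1, 3]; index 1 should come up
    # roughly three times as often as index 0.
    random.seed(0)
    counts = Counter(weighted_choice([1, 3]) for _ in range(10_000))
    print(counts)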
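And a minimal end-to-end sketch of how the patched TokenChunkDataset might be driven. The full __init__ signature is not shown in this diff, so the keyword names below (sources, tokenizer, seq_len, eos_id) and the config path are assumptions for illustration, not the file's actual API:

    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    from supernova.data import TokenChunkDataset, load_sources_from_yaml

    # Hypothetical config path; the YAML must supply the DataSource fields
    # (name, hf_path, hf_name, split, text_field, weight, streaming).
    sources = load_sources_from_yaml("configs/sources.yaml")
    tok = AutoTokenizer.from_pretrained("gpt2")

    ds = TokenChunkDataset(
        sources=sources,          # assumed parameter name
        tokenizer=tok,            # assumed; the diff shows it stored as self.tok
        seq_len=1024,
        eos_id=tok.eos_token_id,  # appended after every document
    )

    # Each item is an (x, y) pair with y shifted one token relative to x,
    # so a plain DataLoader yields next-token-prediction batches directly.
    loader = DataLoader(ds, batch_size=8)
    x, y = next(iter(loader))
    print(x.shape, y.shape)  # torch.Size([8, 1024]) for both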