Spaces:

Neon-coding
/

Atrain

Runtime error

App Files Files Community

Neon-tech committed on May 30

Commit

f650f50

verified ·

1 Parent(s): 5ac0c16

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -5

app.py CHANGED Viewed

@@ -1,6 +1,77 @@
-import subprocess, os
-subprocess.run([
-   "curl", "-L", "-o", "/data/train.bin",
-   "https://storage.googleapis.com/kagglesdsdata/datasets/10431689/16278810/train.bin?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20260529%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260529T184132Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3a6fe517d993ef6b350e3361fd1d7cfda6eddbdd7ef31da13428018032bc6ac22a7f8ce04098f7f5ca4d9e269e3c77d009adbb5d3d9c234fac2fffac297f548231c1a3a42cb421a24d85780e56948788ecbf24c7ed90e8e1e0f31ff011fd8a6e162bda93f96a6764f0e7ec5387eeaf594e201eb346a48413458d6e51f7f16c230d7f90cb7db6ad51584700dd611d5cb88f8ab825c9103545c974ab86b2180fa8e8b4f259cd5e0d78c693484d17b7b7a472d428ff67d06b372beff2dc60ccfc86043c07b71cc42a25e1a3418b80cb0abd7ffc9b06e255de7a2add2013ae8cc5e4b354d537b7b86b3e335cbb4a2e491fb2d80a1235ffa7f80294bd0fe4d677c354"
-], check=True)

+import numpy as np, os
+from pathlib import Path
+# --- Settings for building the validation split ---
+OUT_PATH = "/data/val.bin"  # file the sampled validation tokens are written to
+DOCS_PER_SOURCE = 100  # documents taken from each shard; 20 shards -> ~2000 total
+SEP_ID = 6  # id of the <sep> document separator -- TODO: confirm against the tokenizer
+# One shard for each non-code corpus (the textbook source is left out).
+_TEXT_SHARDS = [
+    "tokenized/fineweb__000_00007.bin",
+    "tokenized/wikipedia__train-00005-of-00041.bin",
+    "tokenized/openwebmath__train-00000-of-00114.bin",
+    "tokenized/phi__programming_books.bin",
+]
+# One shard for each of the 16 code languages.
+# NOTE(review): "C%23" is the percent-encoded spelling of "C#"; verify that it
+# matches the file name actually present under /data/tokenized.
+_CODE_SHARDS = [
+    "tokenized/code__shard_000000_Python.bin",
+    "tokenized/code__shard_000000_JavaScript.bin",
+    "tokenized/code__shard_000000_TypeScript.bin",
+    "tokenized/code__shard_000000_Shell.bin",
+    "tokenized/code__shard_000000_C.bin",
+    "tokenized/code__shard_000000_C++.bin",
+    "tokenized/code__shard_000000_Java.bin",
+    "tokenized/code__shard_000000_Go.bin",
+    "tokenized/code__shard_000000_Rust.bin",
+    "tokenized/code__shard_000000_Ruby.bin",
+    "tokenized/code__shard_000000_PHP.bin",
+    "tokenized/code__shard_000000_SQL.bin",
+    "tokenized/code__shard_000000_C%23.bin",
+    "tokenized/code__shard_000000_Scala.bin",
+    "tokenized/code__shard_000000_Lua.bin",
+    "tokenized/code__shard_000000_Perl.bin",
+]
+# Shard paths (joined onto /data by the driver loop), in sampling order.
+SOURCES = [*_TEXT_SHARDS, *_CODE_SHARDS]
+def extract_docs(bin_path, sep_id, n_docs):
+    """Return the first ``n_docs`` separator-terminated documents of a token file.
+
+    The file is read in fixed-size chunks and interpreted as a flat stream of
+    ``uint16`` token ids (NumPy's native byte order). Reading stops as soon as
+    enough documents have been collected, so large files are never loaded
+    whole.
+
+    Args:
+        bin_path: Path of the binary token file.
+        sep_id: Token id that terminates a document.
+        n_docs: Maximum number of documents to return; ``<= 0`` yields ``[]``.
+
+    Returns:
+        A list of at most ``n_docs`` one-dimensional ``uint16`` arrays in file
+        order. The separator itself is not included in any document.
+
+    Notes:
+        * Empty documents (back-to-back separators) are skipped.
+        * Tokens after the last separator are not returned, because only
+          documents closed by ``sep_id`` count.
+        * A stray trailing byte (odd file size) is ignored instead of making
+          ``np.frombuffer`` raise ``ValueError``.
+    """
+    docs = []
+    pending = []  # slices of the document being assembled across chunks
+    itemsize = np.dtype(np.uint16).itemsize
+    chunk_tokens = 1_000_000
+    with open(bin_path, "rb") as f:
+        while len(docs) < n_docs:
+            raw = f.read(chunk_tokens * itemsize)
+            if not raw:
+                break
+            # Only the final read can be short; drop a dangling half token.
+            usable = len(raw) - len(raw) % itemsize
+            tokens = np.frombuffer(raw[:usable], dtype=np.uint16)
+            start = 0
+            # Find separators in one vectorised pass instead of per token.
+            for sep_pos in np.flatnonzero(tokens == sep_id):
+                piece = tokens[start:sep_pos]
+                start = sep_pos + 1
+                if piece.size:
+                    pending.append(piece)
+                if not pending:
+                    continue  # empty document: nothing between separators
+                # concatenate() copies, so results do not pin the chunk buffer.
+                docs.append(np.concatenate(pending))
+                pending = []
+                if len(docs) >= n_docs:
+                    break
+            else:
+                # Chunk ended inside a document; keep the partial tail.
+                tail = tokens[start:]
+                if tail.size:
+                    pending.append(tail)
+    return docs
+# Gather up to DOCS_PER_SOURCE documents from every shard present on disk.
+all_docs = []
+for src in SOURCES:
+    path = f"/data/{src}"
+    if os.path.exists(path):
+        docs = extract_docs(path, SEP_ID, DOCS_PER_SOURCE)
+        all_docs += docs
+        # Report only the file name, not the "tokenized/" prefix.
+        print(f"  {src.rsplit('/', 1)[-1]}: {len(docs)} docs")
+    else:
+        # A missing shard is reported and skipped rather than treated as fatal.
+        print(f"  Missing: {src}")
+print(f"\nTotal val docs: {len(all_docs):,}")
+# Serialise the collected documents back-to-back into val.bin, re-adding the
+# separator that extract_docs stripped from the end of each document.
+with open(OUT_PATH, "wb") as out:
+    for doc in all_docs:
+        terminated = np.append(doc, SEP_ID).astype(np.uint16)
+        terminated.tofile(out)
+size_mb = os.path.getsize(OUT_PATH)/1e6
+print(f"val.bin written: {size_mb:.1f} MB")