Neon-tech committed on
Commit
f650f50
·
verified ·
1 Parent(s): 5ac0c16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -5
app.py CHANGED
@@ -1,6 +1,77 @@
1
- import subprocess, os
 
2
 
3
- subprocess.run([
4
- "curl", "-L", "-o", "/data/train.bin",
5
- "https://storage.googleapis.com/kagglesdsdata/datasets/10431689/16278810/train.bin?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20260529%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260529T184132Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3a6fe517d993ef6b350e3361fd1d7cfda6eddbdd7ef31da13428018032bc6ac22a7f8ce04098f7f5ca4d9e269e3c77d009adbb5d3d9c234fac2fffac297f548231c1a3a42cb421a24d85780e56948788ecbf24c7ed90e8e1e0f31ff011fd8a6e162bda93f96a6764f0e7ec5387eeaf594e201eb346a48413458d6e51f7f16c230d7f90cb7db6ad51584700dd611d5cb88f8ab825c9103545c974ab86b2180fa8e8b4f259cd5e0d78c693484d17b7b7a472d428ff67d06b372beff2dc60ccfc86043c07b71cc42a25e1a3418b80cb0abd7ffc9b06e255de7a2add2013ae8cc5e4b354d537b7b86b3e335cbb4a2e491fb2d80a1235ffa7f80294bd0fe4d677c354"
6
- ], check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np, os
2
+ from pathlib import Path
3
 
4
# Token id that terminates each document in the tokenized shards.
SEP_ID = 6  # <sep> token id -- still to be confirmed against the tokenizer
# Documents taken from each shard; 20 shards x 100 docs = ~2000 total.
DOCS_PER_SOURCE = 100
# Destination of the assembled validation set.
OUT_PATH = "/data/val.bin"

# Languages of the code shards (16 in total), in output order.
# NOTE(review): "C%23" is the URL-encoded form of "C#"; it is kept verbatim
# here -- confirm it matches the file name actually present on disk.
_CODE_LANGUAGES = (
    "Python",
    "JavaScript",
    "TypeScript",
    "Shell",
    "C",
    "C++",
    "Java",
    "Go",
    "Rust",
    "Ruby",
    "PHP",
    "SQL",
    "C%23",
    "Scala",
    "Lua",
    "Perl",
)

# All source files except textbook, as paths relative to /data.
SOURCES = [
    "tokenized/fineweb__000_00007.bin",  # fineweb
    "tokenized/wikipedia__train-00005-of-00041.bin",  # wikipedia
    "tokenized/openwebmath__train-00000-of-00114.bin",  # openwebmath
    "tokenized/phi__programming_books.bin",  # phi
    # code -- one shard per language
    *(f"tokenized/code__shard_000000_{lang}.bin" for lang in _CODE_LANGUAGES),
]
36
+
37
def extract_docs(bin_path, sep_id, n_docs):
    """Return the first ``n_docs`` separator-terminated documents of a shard.

    The file is read as a flat stream of native-endian ``uint16`` tokens in
    chunks of 1,000,000 tokens, so the whole shard is never held in memory.
    Separator positions are located with a vectorised NumPy mask instead of
    a per-token Python loop.

    Args:
        bin_path: Path of the binary token file.
        sep_id: Token id that terminates a document.
        n_docs: Maximum number of documents to return.

    Returns:
        A list of at most ``n_docs`` one-dimensional ``uint16`` arrays, in
        file order, without the separator token. Empty documents
        (back-to-back separators) are skipped, and tokens after the last
        separator in the file are discarded because that document is
        unterminated.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a read yields an odd number of bytes (raised by
            ``np.frombuffer``).
    """
    docs = []
    # Pieces of the document still being assembled; a document may span
    # several chunks. Only non-empty pieces are ever stored here.
    pending = []
    chunk_bytes = 1_000_000 * 2  # 1M uint16 tokens per read

    with open(bin_path, "rb") as f:
        while len(docs) < n_docs:
            raw = f.read(chunk_bytes)
            if not raw:
                break
            tokens = np.frombuffer(raw, dtype=np.uint16)

            start = 0
            for end in np.flatnonzero(tokens == sep_id).tolist():
                piece = tokens[start:end]
                start = end + 1
                if not pending and piece.size == 0:
                    continue  # empty document: nothing between separators
                pending.append(piece)
                # concatenate copies, so the result owns its data and does
                # not keep the (read-only) chunk buffer alive.
                docs.append(np.concatenate(pending))
                pending = []
                if len(docs) >= n_docs:
                    break
            else:
                # Chunk exhausted without hitting the quota: carry the
                # unterminated tail over to the next chunk.
                tail = tokens[start:]
                if tail.size:
                    pending.append(tail)
    return docs
58
+
59
# Gather up to DOCS_PER_SOURCE documents from every shard found under /data;
# shards that are not on disk are reported and skipped.
all_docs = []
for src in SOURCES:
    path = f"/data/{src}"
    if os.path.exists(path):
        docs = extract_docs(path, SEP_ID, DOCS_PER_SOURCE)
        all_docs += docs
        print(f" {src.split('/')[-1]}: {len(docs)} docs")
    else:
        print(f" Missing: {src}")

print(f"\nTotal val docs: {len(all_docs):,}")

# Write to val.bin: each document followed by one separator token, as uint16.
with open(OUT_PATH, "wb") as f:
    for doc in all_docs:
        np.append(doc, SEP_ID).astype(np.uint16).tofile(f)

size_mb = os.path.getsize(OUT_PATH) / 1e6
print(f"val.bin written: {size_mb:.1f} MB")