Neon-tech commited on
Commit
39937a7
Β·
verified Β·
1 Parent(s): f1a8e39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -7
app.py CHANGED
@@ -1,10 +1,194 @@
1
- import shutil
 
 
 
 
 
 
 
2
  from pathlib import Path
 
 
3
 
4
- for folder in ["/data/by-language", "/data/progress_state.json"]:
5
- p = Path(folder)
6
- if p.exists():
7
- shutil.rmtree(p)
8
- print(f"βœ“ Deleted {folder}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  else:
10
- print(f" Skipped {folder} β€” not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import socket
5
+ import threading
6
+ import io
7
+ import requests
8
+ import pandas as pd
9
  from pathlib import Path
10
+ from tokenizers import Tokenizer
11
+ from huggingface_hub import HfApi
12
 
13
+ # ── Config ───────────────────────────────────────────────────────────────────
14
+ HF_TOKEN = os.environ.get("HF_TOKEN")
15
+ DATASET_REPO = "Neon-coding/github-code-raw"
16
+ TOK_PATH = "/data/tokenizer.json"
17
+ OUT_DIR = "/data/by-language"
18
+ STATE_FILE = "/data/progress_state.json"
19
+ TOTAL_PARQUETS = 880
20
+ SHARD_TOKENS = 1_000_000 # 1M tokens per shard
21
+
22
+ PARQUET_URL = (
23
+ "https://huggingface.co/datasets/codeparrot/github-code-clean"
24
+ "/resolve/main/data/train-{i:05d}-of-00880.parquet"
25
+ )
26
+
27
+ os.makedirs(OUT_DIR, exist_ok=True)
28
+
29
+ api = HfApi(token=HF_TOKEN)
30
+
31
+ # ── Port 7860 β€” keeps Space green ────────────────────────────────────────────
32
+ def serve():
33
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
34
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
35
+ s.bind(("0.0.0.0", 7860))
36
+ s.listen(5)
37
+ print("βœ“ Listening on port 7860")
38
+ while True:
39
+ conn, _ = s.accept()
40
+ conn.send(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nOK")
41
+ conn.close()
42
+
43
+ # ── State ────────────────────────────────────────────────────────────────────
44
+ def load_state():
45
+ if os.path.exists(STATE_FILE):
46
+ with open(STATE_FILE) as f:
47
+ state = json.load(f)
48
+ print(f"Resuming β€” {len(state['done'])} / {TOTAL_PARQUETS} parquets done")
49
  else:
50
+ state = {
51
+ "done": [],
52
+ "lang_shards": {},
53
+ "lang_tokens": {},
54
+ }
55
+ print("Starting fresh")
56
+ return state
57
+
58
+ def save_state(state):
59
+ with open(STATE_FILE, "w") as f:
60
+ json.dump(state, f, indent=2)
61
+
62
+ # ── Shard buffers β€” global per language, persist across parquets ─────────────
63
+ buffers = {}
64
+
65
+ def get_buffer(lang):
66
+ if lang not in buffers:
67
+ buffers[lang] = {"rows": [], "token_count": 0}
68
+ return buffers[lang]
69
+
70
+ def flush_shard(lang, rows, state):
71
+ shard_idx = state["lang_shards"].get(lang, 0)
72
+ lang_dir = Path(OUT_DIR) / lang
73
+ lang_dir.mkdir(parents=True, exist_ok=True)
74
+ shard_name = f"shard_{shard_idx:05d}.jsonl"
75
+ shard_path = lang_dir / shard_name
76
+
77
+ with open(shard_path, "w", encoding="utf-8") as f:
78
+ for row in rows:
79
+ f.write(json.dumps(row, ensure_ascii=False) + "\n")
80
+
81
+ tok_in_shard = sum(r["token_count"] for r in rows)
82
+ state["lang_shards"][lang] = shard_idx + 1
83
+ state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
84
+ # print(f" βœ“ {lang}/{shard_name} | {len(rows)} samples | {tok_in_shard:,} tokens")
85
+
86
+ # ── Main processing loop ─────────────────────────────────────────────────────
87
+ def process(tokenizer, state):
88
+ for i in range(TOTAL_PARQUETS):
89
+ if i in state["done"]:
90
+ print(f"[{i:05d}/{TOTAL_PARQUETS}] SKIP")
91
+ continue
92
+
93
+ url = PARQUET_URL.format(i=i)
94
+ #print(f"[{i:05d}/{TOTAL_PARQUETS}] Downloading...")
95
+
96
+ try:
97
+ resp = requests.get(
98
+ url,
99
+ headers={"Authorization": f"Bearer {HF_TOKEN}"},
100
+ timeout=180,
101
+ )
102
+ resp.raise_for_status()
103
+ df = pd.read_parquet(io.BytesIO(resp.content))
104
+ except Exception as e:
105
+ print(f"[{i:05d}] Download error: {e} β€” skipping")
106
+ continue
107
+
108
+ print(f"[{i:05d}] {len(df):,} rows | {df['language'].nunique()} languages")
109
+
110
+ # row by row β€” constant memory
111
+ for row_tuple in df.itertuples(index=False):
112
+ lang = row_tuple.language
113
+ text = row_tuple.code if row_tuple.code else ""
114
+ repo = row_tuple.repo_name
115
+ fpath = row_tuple.path
116
+ lic = row_tuple.license
117
+
118
+ if not text.strip():
119
+ continue
120
+
121
+ enc = tokenizer.encode(text)
122
+ token_count = len(enc.ids)
123
+
124
+ if token_count < 2:
125
+ continue
126
+
127
+ buf = get_buffer(lang)
128
+ row = {
129
+ "text": text,
130
+ "token_count": token_count,
131
+ "repo": repo,
132
+ "path": fpath,
133
+ "license": lic,
134
+ }
135
+
136
+ if buf["token_count"] + token_count > SHARD_TOKENS and buf["rows"]:
137
+ flush_shard(lang, buf["rows"], state)
138
+ save_state(state)
139
+ buf["rows"] = []
140
+ buf["token_count"] = 0
141
+
142
+ buf["rows"].append(row)
143
+ buf["token_count"] += token_count
144
+
145
+ del df
146
+
147
+ state["done"].append(i)
148
+ save_state(state)
149
+ #print(f"[{i:05d}] βœ“ Complete")
150
+
151
+ # ── Flush remaining partial shards ────────────────────────────────────────
152
+ print("\nFlushing remaining buffers...")
153
+ for lang, buf in buffers.items():
154
+ if buf["rows"]:
155
+ flush_shard(lang, buf["rows"], state)
156
+ save_state(state)
157
+
158
+ # ── Write meta.json per language ──────────────────────────────────────────
159
+ print("\nWriting meta.json per language...")
160
+ for lang in state["lang_tokens"]:
161
+ meta = {
162
+ "language": lang,
163
+ "total_tokens": state["lang_tokens"][lang],
164
+ "total_shards": state["lang_shards"].get(lang, 0),
165
+ }
166
+ meta_path = Path(OUT_DIR) / lang / "meta.json"
167
+ with open(meta_path, "w") as f:
168
+ json.dump(meta, f, indent=2)
169
+ print(f" {lang}: {meta['total_tokens']:,} tokens | {meta['total_shards']} shards")
170
+
171
+ # ── Push everything to HF dataset repo ───────────────────────────────────
172
+ print(f"\nPushing to {DATASET_REPO}...")
173
+ api.upload_folder(
174
+ folder_path=OUT_DIR,
175
+ repo_id=DATASET_REPO,
176
+ repo_type="dataset",
177
+ token=HF_TOKEN,
178
+ )
179
+ print("\nβœ“ All done!")
180
+
181
+ # ── Entry point ──────────────────────────────────────────────────────────────
182
+ if __name__ == "__main__":
183
+ threading.Thread(target=serve, daemon=True).start()
184
+
185
+ print("βœ“ Loading tokenizer from /data/tokenizer.json...")
186
+ tokenizer = Tokenizer.from_file(TOK_PATH)
187
+ print(f"βœ“ Tokenizer loaded | vocab: {tokenizer.get_vocab_size():,}")
188
+
189
+ state = load_state()
190
+
191
+ threading.Thread(target=process, args=(tokenizer, state), daemon=True).start()
192
+
193
+ while True:
194
+ time.sleep(60)