Neon-tech committed on
Commit
63875bc
·
verified ·
1 Parent(s): 2623698

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -223
app.py CHANGED
@@ -1,226 +1,10 @@
1
- import os
2
- import json
3
- import time
4
- import threading
5
- import io
6
- import requests
7
- import pandas as pd
8
  from pathlib import Path
9
- from tokenizers import Tokenizer
10
- from huggingface_hub import HfApi
11
- import socket
12
- import threading
13
- import time
14
 
15
- def keep_alive():
16
- s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
17
- s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
18
- s.bind(("0.0.0.0", 7860))
19
- s.listen(5)
20
- while True:
21
- conn, _ = s.accept()
22
- conn.send(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nOK")
23
- conn.close()
24
- # ── Config ───────────────────────────────────────────────────────────────────
25
- HF_TOKEN = os.environ.get("HF_TOKEN")
26
- HF_USERNAME = "Neon-coding"
27
- DATASET_REPO = f"{HF_USERNAME}/github-code-raw"
28
- BUCKET_REPO = f"{HF_USERNAME}/ureola-bucket" # where tokenizer.json lives
29
- OUT_DIR = "/data/by-language"
30
- STATE_FILE = "/data/progress_state.json"
31
- TOK_FILENAME = "tokenizer.json"
32
- TOTAL_PARQUETS = 880
33
- SHARD_TOKENS = 100_000 # exactly 100k tokens per shard file
34
-
35
- PARQUET_URL = (
36
- "https://huggingface.co/datasets/codeparrot/github-code-clean"
37
- "/resolve/main/data/train-{i:05d}-of-00880.parquet"
38
- )
39
-
40
- os.makedirs(OUT_DIR, exist_ok=True)
41
- os.makedirs("/data", exist_ok=True)
42
-
43
- api = HfApi(token=HF_TOKEN)
44
-
45
- # ── Pull tokenizer.json from bucket ─────────────────────────────────────────
46
- def load_tokenizer():
47
- tok_path = f"/data/{TOK_FILENAME}"
48
- if not os.path.exists(tok_path):
49
- print("Pulling tokenizer.json from bucket...")
50
- api.hf_hub_download(
51
- repo_id=BUCKET_REPO,
52
- repo_type="dataset",
53
- filename=TOK_FILENAME,
54
- local_dir="/data",
55
- token=HF_TOKEN,
56
- )
57
- tokenizer = Tokenizer.from_file(tok_path)
58
- print(f"✓ Tokenizer loaded | vocab: {tokenizer.get_vocab_size():,}")
59
- return tokenizer
60
-
61
- # ── State ────────────────────────────────────────────────────────────────────
62
- def load_state():
63
- if os.path.exists(STATE_FILE):
64
- with open(STATE_FILE) as f:
65
- state = json.load(f)
66
- print(f"Resuming — {len(state['done'])} parquets done")
67
  else:
68
- state = {
69
- "done": [], # list of parquet indices completed
70
- "lang_shards": {}, # {lang: current shard index}
71
- "lang_tokens": {}, # {lang: total tokens written so far}
72
- }
73
- print("Starting fresh")
74
- return state
75
-
76
- def save_state(state):
77
- with open(STATE_FILE, "w") as f:
78
- json.dump(state, f, indent=2)
79
-
80
- # ── Shard buffer: one per language, persists across parquets ─────────────────
81
- # buffers[lang] = {"rows": [...], "token_count": N}
82
- buffers = {}
83
-
84
- def get_buffer(lang):
85
- if lang not in buffers:
86
- buffers[lang] = {"rows": [], "token_count": 0}
87
- return buffers[lang]
88
-
89
- def flush_shard(lang, rows, state):
90
- """Write rows to a new shard file and upload to HF dataset repo."""
91
- shard_idx = state["lang_shards"].get(lang, 0)
92
- lang_dir = Path(OUT_DIR) / lang
93
- lang_dir.mkdir(parents=True, exist_ok=True)
94
-
95
- shard_name = f"shard_{shard_idx:05d}.jsonl"
96
- shard_path = lang_dir / shard_name
97
-
98
- with open(shard_path, "w") as f:
99
- for row in rows:
100
- f.write(json.dumps(row, ensure_ascii=False) + "\n")
101
-
102
- # upload to HF
103
- api.upload_file(
104
- path_or_fileobj=str(shard_path),
105
- path_in_repo=f"{lang}/{shard_name}",
106
- repo_id=DATASET_REPO,
107
- repo_type="dataset",
108
- token=HF_TOKEN,
109
- )
110
- print(f" ✓ Uploaded {lang}/{shard_name} | {len(rows)} samples")
111
-
112
- # update state
113
- state["lang_shards"][lang] = shard_idx + 1
114
- state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + sum(
115
- r["token_count"] for r in rows
116
- )
117
-
118
- # ── Core processing loop ─────────────────────────────────────────────────────
119
- def process(tokenizer, state):
120
- for i in range(TOTAL_PARQUETS):
121
- if i in state["done"]:
122
- print(f"[{i:05d}] SKIP")
123
- continue
124
-
125
- url = PARQUET_URL.format(i=i)
126
- print(f"[{i:05d}] Downloading...")
127
-
128
- try:
129
- resp = requests.get(
130
- url,
131
- headers={"Authorization": f"Bearer {HF_TOKEN}"},
132
- timeout=120,
133
- )
134
- resp.raise_for_status()
135
- df = pd.read_parquet(io.BytesIO(resp.content))
136
- except Exception as e:
137
- print(f"[{i:05d}] Download error: {e} — skipping")
138
- continue
139
-
140
- print(f"[{i:05d}] {len(df):,} rows | processing...")
141
-
142
- for lang, group in df.groupby("language"):
143
- buf = get_buffer(lang)
144
-
145
- texts = group["code"].fillna("").tolist()
146
- repos = group["repo_name"].tolist()
147
- paths = group["path"].tolist()
148
- licenses = group["license"].tolist()
149
-
150
- encoded = tokenizer.encode_batch(texts)
151
-
152
- for idx, enc in enumerate(encoded):
153
- token_count = len(enc.ids)
154
-
155
- # skip junk (empty or single token)
156
- if token_count < 2:
157
- continue
158
-
159
- row = {
160
- "text": texts[idx],
161
- "token_count": token_count,
162
- "repo": repos[idx],
163
- "path": paths[idx],
164
- "license": licenses[idx],
165
- }
166
-
167
- # if this single sample alone exceeds shard size, still include it
168
- # — don't lose real data, just let that shard be a bit over
169
- if buf["token_count"] + token_count > SHARD_TOKENS and buf["rows"]:
170
- # flush current buffer first
171
- flush_shard(lang, buf["rows"], state)
172
- save_state(state)
173
- buf["rows"] = []
174
- buf["token_count"] = 0
175
-
176
- buf["rows"].append(row)
177
- buf["token_count"] += token_count
178
-
179
- state["done"].append(i)
180
- save_state(state)
181
- print(f"[{i:05d}] ✓ Done")
182
-
183
- # ── Flush any remaining partial shards ───────────────────────────────────
184
- print("\nFlushing remaining buffers...")
185
- for lang, buf in buffers.items():
186
- if buf["rows"]:
187
- flush_shard(lang, buf["rows"], state)
188
- save_state(state)
189
-
190
- # ── Write per-language meta ───────────────────────────────────────────────
191
- print("\nWriting meta.json per language...")
192
- for lang, total_tokens in state["lang_tokens"].items():
193
- meta = {
194
- "language": lang,
195
- "total_tokens": total_tokens,
196
- "total_shards": state["lang_shards"].get(lang, 0),
197
- }
198
- meta_path = Path(OUT_DIR) / lang / "meta.json"
199
- with open(meta_path, "w") as f:
200
- json.dump(meta, f, indent=2)
201
- api.upload_file(
202
- path_or_fileobj=str(meta_path),
203
- path_in_repo=f"{lang}/meta.json",
204
- repo_id=DATASET_REPO,
205
- repo_type="dataset",
206
- token=HF_TOKEN,
207
- )
208
- print(f" {lang}: {total_tokens:,} tokens | {meta['total_shards']} shards")
209
-
210
- print("\n✓ All done!")
211
-
212
- # ── Entry point ──────────────────────────────────────────────────────────────
213
- def main():
214
- tokenizer = load_tokenizer()
215
- state = load_state()
216
-
217
- # fire processing in background so Space stays alive
218
- t = threading.Thread(target=process, args=(tokenizer, state), daemon=True)
219
- t.start()
220
-
221
- # keep the Space running
222
- while True:
223
- time.sleep(60)
224
-
225
- if __name__ == "__main__":
226
- main()
 
1
+ import shutil
 
 
 
 
 
 
2
  from pathlib import Path
 
 
 
 
 
3
 
4
+ for folder in ["/data/by-language", "/data/codeparrot-raw"]:
5
+ p = Path(folder)
6
+ if p.exists():
7
+ shutil.rmtree(p)
8
+ print(f"✓ Deleted {folder}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  else:
10
+ print(f" Skipped {folder} — not found")