Neon-tech commited on
Commit
5c9e20e
Β·
verified Β·
1 Parent(s): 63875bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +189 -7
app.py CHANGED
@@ -1,10 +1,192 @@
1
- import shutil
 
 
 
 
 
 
 
2
  from pathlib import Path
 
 
3
 
4
- for folder in ["/data/by-language", "/data/codeparrot-raw"]:
5
- p = Path(folder)
6
- if p.exists():
7
- shutil.rmtree(p)
8
- print(f"βœ“ Deleted {folder}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  else:
10
- print(f" Skipped {folder} β€” not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import socket
5
+ import threading
6
+ import io
7
+ import requests
8
+ import pandas as pd
9
  from pathlib import Path
10
+ from tokenizers import Tokenizer
11
+ from huggingface_hub import HfApi
12
 
13
+ # ── Config ───────────────────────────────────────────────────────────────────
14
+ HF_TOKEN = os.environ.get("HF_TOKEN")
15
+ DATASET_REPO = "Neon-coding/github-code-raw"
16
+ TOK_PATH = "/data/tokenizer.json"
17
+ OUT_DIR = "/data/by-language"
18
+ STATE_FILE = "/data/progress_state.json"
19
+ TOTAL_PARQUETS = 880
20
+ SHARD_TOKENS = 100_000
21
+
22
+ PARQUET_URL = (
23
+ "https://huggingface.co/datasets/codeparrot/github-code-clean"
24
+ "/resolve/main/data/train-{i:05d}-of-00880.parquet"
25
+ )
26
+
27
+ os.makedirs(OUT_DIR, exist_ok=True)
28
+
29
+ api = HfApi(token=HF_TOKEN)
30
+
31
+ # ── Port 7860 β€” keeps Space green ────────────────────────────────────────────
32
+ def serve():
33
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
34
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
35
+ s.bind(("0.0.0.0", 7860))
36
+ s.listen(5)
37
+ print("βœ“ Listening on port 7860")
38
+ while True:
39
+ conn, _ = s.accept()
40
+ conn.send(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nOK")
41
+ conn.close()
42
+
43
+ # ── State ────────────────────────────────────────────────────────────────────
44
+ def load_state():
45
+ if os.path.exists(STATE_FILE):
46
+ with open(STATE_FILE) as f:
47
+ state = json.load(f)
48
+ print(f"Resuming β€” {len(state['done'])} / {TOTAL_PARQUETS} parquets done")
49
  else:
50
+ state = {
51
+ "done": [], # parquet indices completed
52
+ "lang_shards": {}, # {lang: next shard index}
53
+ "lang_tokens": {}, # {lang: total tokens written}
54
+ }
55
+ print("Starting fresh")
56
+ return state
57
+
58
+ def save_state(state):
59
+ with open(STATE_FILE, "w") as f:
60
+ json.dump(state, f, indent=2)
61
+
62
+ # ── Shard buffers β€” global per language, persist across parquets ─────────────
63
+ buffers = {} # {lang: {"rows": [...], "token_count": N}}
64
+
65
+ def get_buffer(lang):
66
+ if lang not in buffers:
67
+ buffers[lang] = {"rows": [], "token_count": 0}
68
+ return buffers[lang]
69
+
70
+ def flush_shard(lang, rows, state):
71
+ shard_idx = state["lang_shards"].get(lang, 0)
72
+ lang_dir = Path(OUT_DIR) / lang
73
+ lang_dir.mkdir(parents=True, exist_ok=True)
74
+ shard_name = f"shard_{shard_idx:05d}.jsonl"
75
+ shard_path = lang_dir / shard_name
76
+
77
+ with open(shard_path, "w", encoding="utf-8") as f:
78
+ for row in rows:
79
+ f.write(json.dumps(row, ensure_ascii=False) + "\n")
80
+
81
+ tok_in_shard = sum(r["token_count"] for r in rows)
82
+ state["lang_shards"][lang] = shard_idx + 1
83
+ state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
84
+ print(f" βœ“ {lang}/{shard_name} | {len(rows)} samples | {tok_in_shard:,} tokens")
85
+
86
+ # ── Main processing loop ─────────────────────────────────────────────────────
87
+ def process(tokenizer, state):
88
+ for i in range(TOTAL_PARQUETS):
89
+ if i in state["done"]:
90
+ print(f"[{i:05d}/{TOTAL_PARQUETS}] SKIP")
91
+ continue
92
+
93
+ url = PARQUET_URL.format(i=i)
94
+ print(f"[{i:05d}/{TOTAL_PARQUETS}] Downloading...")
95
+
96
+ try:
97
+ resp = requests.get(
98
+ url,
99
+ headers={"Authorization": f"Bearer {HF_TOKEN}"},
100
+ timeout=180,
101
+ )
102
+ resp.raise_for_status()
103
+ df = pd.read_parquet(io.BytesIO(resp.content))
104
+ except Exception as e:
105
+ print(f"[{i:05d}] Download error: {e} β€” skipping")
106
+ continue
107
+
108
+ print(f"[{i:05d}] {len(df):,} rows | {df['language'].nunique()} languages")
109
+
110
+ for lang, group in df.groupby("language"):
111
+ buf = get_buffer(lang)
112
+ texts = group["code"].fillna("").tolist()
113
+ repos = group["repo_name"].tolist()
114
+ paths = group["path"].tolist()
115
+ licenses = group["license"].tolist()
116
+
117
+ encoded = tokenizer.encode_batch(texts)
118
+
119
+ for idx, enc in enumerate(encoded):
120
+ token_count = len(enc.ids)
121
+ if token_count < 2:
122
+ continue
123
+
124
+ row = {
125
+ "text": texts[idx],
126
+ "token_count": token_count,
127
+ "repo": repos[idx],
128
+ "path": paths[idx],
129
+ "license": licenses[idx],
130
+ }
131
+
132
+ # flush before adding if it would overflow
133
+ # (if sample itself > 100k, still let it through β€” don't lose data)
134
+ if buf["token_count"] + token_count > SHARD_TOKENS and buf["rows"]:
135
+ flush_shard(lang, buf["rows"], state)
136
+ save_state(state)
137
+ buf["rows"] = []
138
+ buf["token_count"] = 0
139
+
140
+ buf["rows"].append(row)
141
+ buf["token_count"] += token_count
142
+
143
+ state["done"].append(i)
144
+ save_state(state)
145
+ print(f"[{i:05d}] βœ“ Complete")
146
+
147
+ # ── Flush remaining partial shards ────────────────────────────────────────
148
+ print("\nFlushing remaining buffers...")
149
+ for lang, buf in buffers.items():
150
+ if buf["rows"]:
151
+ flush_shard(lang, buf["rows"], state)
152
+ save_state(state)
153
+
154
+ # ── Write meta.json per language ──────────────────────────────────────────
155
+ print("\nWriting meta.json per language...")
156
+ for lang in state["lang_tokens"]:
157
+ meta = {
158
+ "language": lang,
159
+ "total_tokens": state["lang_tokens"][lang],
160
+ "total_shards": state["lang_shards"].get(lang, 0),
161
+ }
162
+ meta_path = Path(OUT_DIR) / lang / "meta.json"
163
+ with open(meta_path, "w") as f:
164
+ json.dump(meta, f, indent=2)
165
+ print(f" {lang}: {meta['total_tokens']:,} tokens | {meta['total_shards']} shards")
166
+
167
+ # ── Push everything to HF dataset repo ───────────────────────────────────
168
+ print(f"\nPushing to {DATASET_REPO}...")
169
+ api.upload_folder(
170
+ folder_path=OUT_DIR,
171
+ repo_id=DATASET_REPO,
172
+ repo_type="dataset",
173
+ token=HF_TOKEN,
174
+ )
175
+ print("\nβœ“ All done!")
176
+
177
+ # ── Entry point ──────────────────────────────────────────────────────────────
178
+ if __name__ == "__main__":
179
+ # start port listener first β€” Space goes green immediately
180
+ threading.Thread(target=serve, daemon=True).start()
181
+
182
+ print("βœ“ Loading tokenizer from /data/tokenizer.json...")
183
+ tokenizer = Tokenizer.from_file(TOK_PATH)
184
+ print(f"βœ“ Tokenizer loaded | vocab: {tokenizer.get_vocab_size():,}")
185
+
186
+ state = load_state()
187
+
188
+ threading.Thread(target=process, args=(tokenizer, state), daemon=True).start()
189
+
190
+ # main thread keeps Space alive
191
+ while True:
192
+ time.sleep(60)