Neon-tech commited on
Commit
99d6b9d
·
verified ·
1 Parent(s): 5c9e20e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -189
app.py CHANGED
@@ -1,192 +1,10 @@
1
- import os
2
- import json
3
- import time
4
- import socket
5
- import threading
6
- import io
7
- import requests
8
- import pandas as pd
9
  from pathlib import Path
10
- from tokenizers import Tokenizer
11
- from huggingface_hub import HfApi
12
 
13
- # ── Config ───────────────────────────────────────────────────────────────────
14
- HF_TOKEN = os.environ.get("HF_TOKEN")
15
- DATASET_REPO = "Neon-coding/github-code-raw"
16
- TOK_PATH = "/data/tokenizer.json"
17
- OUT_DIR = "/data/by-language"
18
- STATE_FILE = "/data/progress_state.json"
19
- TOTAL_PARQUETS = 880
20
- SHARD_TOKENS = 100_000
21
-
22
- PARQUET_URL = (
23
- "https://huggingface.co/datasets/codeparrot/github-code-clean"
24
- "/resolve/main/data/train-{i:05d}-of-00880.parquet"
25
- )
26
-
27
- os.makedirs(OUT_DIR, exist_ok=True)
28
-
29
- api = HfApi(token=HF_TOKEN)
30
-
31
- # ── Port 7860 — keeps Space green ────────────────────────────────────────────
32
- def serve():
33
- s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
34
- s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
35
- s.bind(("0.0.0.0", 7860))
36
- s.listen(5)
37
- print("✓ Listening on port 7860")
38
- while True:
39
- conn, _ = s.accept()
40
- conn.send(b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\n\r\nOK")
41
- conn.close()
42
-
43
- # ── State ────────────────────────────────────────────────────────────────────
44
- def load_state():
45
- if os.path.exists(STATE_FILE):
46
- with open(STATE_FILE) as f:
47
- state = json.load(f)
48
- print(f"Resuming — {len(state['done'])} / {TOTAL_PARQUETS} parquets done")
49
  else:
50
- state = {
51
- "done": [], # parquet indices completed
52
- "lang_shards": {}, # {lang: next shard index}
53
- "lang_tokens": {}, # {lang: total tokens written}
54
- }
55
- print("Starting fresh")
56
- return state
57
-
58
- def save_state(state):
59
- with open(STATE_FILE, "w") as f:
60
- json.dump(state, f, indent=2)
61
-
62
- # ── Shard buffers — global per language, persist across parquets ─────────────
63
- buffers = {} # {lang: {"rows": [...], "token_count": N}}
64
-
65
- def get_buffer(lang):
66
- if lang not in buffers:
67
- buffers[lang] = {"rows": [], "token_count": 0}
68
- return buffers[lang]
69
-
70
- def flush_shard(lang, rows, state):
71
- shard_idx = state["lang_shards"].get(lang, 0)
72
- lang_dir = Path(OUT_DIR) / lang
73
- lang_dir.mkdir(parents=True, exist_ok=True)
74
- shard_name = f"shard_{shard_idx:05d}.jsonl"
75
- shard_path = lang_dir / shard_name
76
-
77
- with open(shard_path, "w", encoding="utf-8") as f:
78
- for row in rows:
79
- f.write(json.dumps(row, ensure_ascii=False) + "\n")
80
-
81
- tok_in_shard = sum(r["token_count"] for r in rows)
82
- state["lang_shards"][lang] = shard_idx + 1
83
- state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
84
- print(f" ✓ {lang}/{shard_name} | {len(rows)} samples | {tok_in_shard:,} tokens")
85
-
86
- # ── Main processing loop ─────────────────────────────────────────────────────
87
- def process(tokenizer, state):
88
- for i in range(TOTAL_PARQUETS):
89
- if i in state["done"]:
90
- print(f"[{i:05d}/{TOTAL_PARQUETS}] SKIP")
91
- continue
92
-
93
- url = PARQUET_URL.format(i=i)
94
- print(f"[{i:05d}/{TOTAL_PARQUETS}] Downloading...")
95
-
96
- try:
97
- resp = requests.get(
98
- url,
99
- headers={"Authorization": f"Bearer {HF_TOKEN}"},
100
- timeout=180,
101
- )
102
- resp.raise_for_status()
103
- df = pd.read_parquet(io.BytesIO(resp.content))
104
- except Exception as e:
105
- print(f"[{i:05d}] Download error: {e} — skipping")
106
- continue
107
-
108
- print(f"[{i:05d}] {len(df):,} rows | {df['language'].nunique()} languages")
109
-
110
- for lang, group in df.groupby("language"):
111
- buf = get_buffer(lang)
112
- texts = group["code"].fillna("").tolist()
113
- repos = group["repo_name"].tolist()
114
- paths = group["path"].tolist()
115
- licenses = group["license"].tolist()
116
-
117
- encoded = tokenizer.encode_batch(texts)
118
-
119
- for idx, enc in enumerate(encoded):
120
- token_count = len(enc.ids)
121
- if token_count < 2:
122
- continue
123
-
124
- row = {
125
- "text": texts[idx],
126
- "token_count": token_count,
127
- "repo": repos[idx],
128
- "path": paths[idx],
129
- "license": licenses[idx],
130
- }
131
-
132
- # flush before adding if it would overflow
133
- # (if sample itself > 100k, still let it through — don't lose data)
134
- if buf["token_count"] + token_count > SHARD_TOKENS and buf["rows"]:
135
- flush_shard(lang, buf["rows"], state)
136
- save_state(state)
137
- buf["rows"] = []
138
- buf["token_count"] = 0
139
-
140
- buf["rows"].append(row)
141
- buf["token_count"] += token_count
142
-
143
- state["done"].append(i)
144
- save_state(state)
145
- print(f"[{i:05d}] ✓ Complete")
146
-
147
- # ── Flush remaining partial shards ────────────────────────────────────────
148
- print("\nFlushing remaining buffers...")
149
- for lang, buf in buffers.items():
150
- if buf["rows"]:
151
- flush_shard(lang, buf["rows"], state)
152
- save_state(state)
153
-
154
- # ── Write meta.json per language ──────────────────────────────────────────
155
- print("\nWriting meta.json per language...")
156
- for lang in state["lang_tokens"]:
157
- meta = {
158
- "language": lang,
159
- "total_tokens": state["lang_tokens"][lang],
160
- "total_shards": state["lang_shards"].get(lang, 0),
161
- }
162
- meta_path = Path(OUT_DIR) / lang / "meta.json"
163
- with open(meta_path, "w") as f:
164
- json.dump(meta, f, indent=2)
165
- print(f" {lang}: {meta['total_tokens']:,} tokens | {meta['total_shards']} shards")
166
-
167
- # ── Push everything to HF dataset repo ───────────────────────────────────
168
- print(f"\nPushing to {DATASET_REPO}...")
169
- api.upload_folder(
170
- folder_path=OUT_DIR,
171
- repo_id=DATASET_REPO,
172
- repo_type="dataset",
173
- token=HF_TOKEN,
174
- )
175
- print("\n✓ All done!")
176
-
177
- # ── Entry point ──────────────────────────────────────────────────────────────
178
- if __name__ == "__main__":
179
- # start port listener first — Space goes green immediately
180
- threading.Thread(target=serve, daemon=True).start()
181
-
182
- print("✓ Loading tokenizer from /data/tokenizer.json...")
183
- tokenizer = Tokenizer.from_file(TOK_PATH)
184
- print(f"✓ Tokenizer loaded | vocab: {tokenizer.get_vocab_size():,}")
185
-
186
- state = load_state()
187
-
188
- threading.Thread(target=process, args=(tokenizer, state), daemon=True).start()
189
-
190
- # main thread keeps Space alive
191
- while True:
192
- time.sleep(60)
 
1
+ import shutil
 
 
 
 
 
 
 
2
  from pathlib import Path
 
 
3
 
4
+ for folder in ["/data/by-language", "/data/codeparrot-raw"]:
5
+ p = Path(folder)
6
+ if p.exists():
7
+ shutil.rmtree(p)
8
+ print(f"✓ Deleted {folder}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  else:
10
+ print(f" Skipped {folder} — not found")