Update app.py
Browse files
app.py
CHANGED
|
@@ -81,7 +81,7 @@ def flush_shard(lang, rows, state):
|
|
| 81 |
tok_in_shard = sum(r["token_count"] for r in rows)
|
| 82 |
state["lang_shards"][lang] = shard_idx + 1
|
| 83 |
state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
|
| 84 |
-
|
| 85 |
|
| 86 |
# ── Main processing loop ─────────────────────────────────────────────────────
|
| 87 |
def process(tokenizer, state):
|
|
@@ -91,7 +91,7 @@ def process(tokenizer, state):
|
|
| 91 |
continue
|
| 92 |
|
| 93 |
url = PARQUET_URL.format(i=i)
|
| 94 |
-
|
| 95 |
|
| 96 |
try:
|
| 97 |
resp = requests.get(
|
|
@@ -146,7 +146,7 @@ def process(tokenizer, state):
|
|
| 146 |
|
| 147 |
state["done"].append(i)
|
| 148 |
save_state(state)
|
| 149 |
-
|
| 150 |
|
| 151 |
# ── Flush remaining partial shards ────────────────────────────────────────
|
| 152 |
print("\nFlushing remaining buffers...")
|
|
|
|
| 81 |
tok_in_shard = sum(r["token_count"] for r in rows)
|
| 82 |
state["lang_shards"][lang] = shard_idx + 1
|
| 83 |
state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
|
| 84 |
+
print(f" β {lang}/{shard_name} | {len(rows)} samples | {tok_in_shard:,} tokens")
|
| 85 |
|
| 86 |
# ── Main processing loop ─────────────────────────────────────────────────────
|
| 87 |
def process(tokenizer, state):
|
|
|
|
| 91 |
continue
|
| 92 |
|
| 93 |
url = PARQUET_URL.format(i=i)
|
| 94 |
+
print(f"[{i:05d}/{TOTAL_PARQUETS}] Downloading...")
|
| 95 |
|
| 96 |
try:
|
| 97 |
resp = requests.get(
|
|
|
|
| 146 |
|
| 147 |
state["done"].append(i)
|
| 148 |
save_state(state)
|
| 149 |
+
print(f"[{i:05d}] β Complete")
|
| 150 |
|
| 151 |
# ── Flush remaining partial shards ────────────────────────────────────────
|
| 152 |
print("\nFlushing remaining buffers...")
|