Neon-tech committed on
Commit
b78a519
·
verified ·
1 Parent(s): 39937a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -81,7 +81,7 @@ def flush_shard(lang, rows, state):
81
  tok_in_shard = sum(r["token_count"] for r in rows)
82
  state["lang_shards"][lang] = shard_idx + 1
83
  state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
84
- # print(f" ✓ {lang}/{shard_name} | {len(rows)} samples | {tok_in_shard:,} tokens")
85
 
86
  # ── Main processing loop ─────────────────────────────────────────────────────
87
  def process(tokenizer, state):
@@ -91,7 +91,7 @@ def process(tokenizer, state):
91
  continue
92
 
93
  url = PARQUET_URL.format(i=i)
94
- #print(f"[{i:05d}/{TOTAL_PARQUETS}] Downloading...")
95
 
96
  try:
97
  resp = requests.get(
@@ -146,7 +146,7 @@ def process(tokenizer, state):
146
 
147
  state["done"].append(i)
148
  save_state(state)
149
- #print(f"[{i:05d}] ✓ Complete")
150
 
151
  # ── Flush remaining partial shards ────────────────────────────────────────
152
  print("\nFlushing remaining buffers...")
 
81
  tok_in_shard = sum(r["token_count"] for r in rows)
82
  state["lang_shards"][lang] = shard_idx + 1
83
  state["lang_tokens"][lang] = state["lang_tokens"].get(lang, 0) + tok_in_shard
84
+ print(f" ✓ {lang}/{shard_name} | {len(rows)} samples | {tok_in_shard:,} tokens")
85
 
86
  # ── Main processing loop ─────────────────────────────────────────────────────
87
  def process(tokenizer, state):
 
91
  continue
92
 
93
  url = PARQUET_URL.format(i=i)
94
+ print(f"[{i:05d}/{TOTAL_PARQUETS}] Downloading...")
95
 
96
  try:
97
  resp = requests.get(
 
146
 
147
  state["done"].append(i)
148
  save_state(state)
149
+ print(f"[{i:05d}] ✓ Complete")
150
 
151
  # ── Flush remaining partial shards ────────────────────────────────────────
152
  print("\nFlushing remaining buffers...")