zirobtc
/

oracle

Model card Files Files and versions

xet

Community

zirobtc committed on Feb 6

Commit

fdd6265

1 Parent(s): 3a4bd03

Upload scripts/cache_dataset.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/cache_dataset.py +47 -3

scripts/cache_dataset.py CHANGED Viewed

@@ -282,10 +282,33 @@ def main():
         )
         print(f"INFO: Total tasks: {len(tasks)} (expected ~{expected_files} output files, target ~{target_total})")
-        print(f"INFO: Starting to cache {len(tasks)} tokens...")
         success_count, skipped_count, error_count = 0, 0, 0
         class_distribution = {}
         process_fn = _process_single_token_context if args.cache_mode == "context" else _process_single_token_raw
         import time as _time
@@ -307,6 +330,10 @@ def main():
                     f"OK: {success_count} Skip: {skipped_count} Err: {error_count}"
                 )
         if args.num_workers == 1:
             print("INFO: Single-threaded mode...")
             _init_worker(db_config, dataset_config, return_class_map, quality_scores_map)
@@ -326,7 +353,10 @@ def main():
                     skipped_count += 1
                 else:
                     error_count += 1
-                    tqdm.write(f"ERROR: {result['mint'][:16]} - {result['error']}")
                 _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
         else:
             print(f"INFO: Running with {args.num_workers} workers...")
@@ -349,11 +379,25 @@ def main():
                             skipped_count += 1
                         else:
                             error_count += 1
                     except Exception as e:
                         error_count += 1
                         tqdm.write(f"WORKER ERROR: {e}")
                     _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
         print("INFO: Building metadata...")
         file_class_map = {}
         for f in sorted(output_dir.glob("sample_*.pt")):

         )
         print(f"INFO: Total tasks: {len(tasks)} (expected ~{expected_files} output files, target ~{target_total})")
         success_count, skipped_count, error_count = 0, 0, 0
         class_distribution = {}
+        # --- Resume support: skip tokens that already have cached files ---
+        existing_files = set(f.name for f in output_dir.glob("sample_*.pt"))
+        if existing_files:
+            pre_resume = len(tasks)
+            filtered_tasks = []
+            already_cached = 0
+            for task in tasks:
+                mint_addr = task[1]  # task = (idx, mint_addr, ...)
+                # Check if any file exists for this mint (context mode: sample_MINT_0.pt, raw mode: sample_MINT.pt)
+                mint_prefix = f"sample_{mint_addr[:16]}"
+                has_cached = any(ef.startswith(mint_prefix) for ef in existing_files)
+                if has_cached:
+                    already_cached += 1
+                    # Count existing files toward class distribution
+                    cid = return_class_map.get(mint_addr)
+                    if cid is not None:
+                        class_distribution[cid] = class_distribution.get(cid, 0) + 1
+                    success_count += 1
+                else:
+                    filtered_tasks.append(task)
+            tasks = filtered_tasks
+            print(f"INFO: Resume: {already_cached} tokens already cached, {len(tasks)} remaining (was {pre_resume})")
+        print(f"INFO: Starting to cache {len(tasks)} tokens...")
         process_fn = _process_single_token_context if args.cache_mode == "context" else _process_single_token_raw
         import time as _time
                     f"OK: {success_count} Skip: {skipped_count} Err: {error_count}"
                 )
+        # Error log file for diagnosing failures
+        error_log_path = Path(args.output_dir) / "cache_errors.log"
+        error_samples = []  # First 20 unique error messages
         if args.num_workers == 1:
             print("INFO: Single-threaded mode...")
             _init_worker(db_config, dataset_config, return_class_map, quality_scores_map)
                     skipped_count += 1
                 else:
                     error_count += 1
+                    err_msg = result.get('error', 'unknown')
+                    tqdm.write(f"ERROR: {result['mint'][:16]} - {err_msg}")
+                    if len(error_samples) < 20:
+                        error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
                 _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
         else:
             print(f"INFO: Running with {args.num_workers} workers...")
                             skipped_count += 1
                         else:
                             error_count += 1
+                            err_msg = result.get('error', 'unknown')
+                            if len(error_samples) < 20:
+                                error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
+                            if error_count <= 5:
+                                tqdm.write(f"ERROR: {result.get('mint', '?')[:16]} - {err_msg}")
                     except Exception as e:
                         error_count += 1
                         tqdm.write(f"WORKER ERROR: {e}")
                     _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
+        # Write error log
+        if error_samples:
+            with open(error_log_path, 'w') as ef:
+                for i, es in enumerate(error_samples):
+                    ef.write(f"=== Error {i+1} === Token: {es['mint']}\n")
+                    ef.write(f"Error: {es['error']}\n")
+                    ef.write(f"Traceback:\n{es['traceback']}\n\n")
+            print(f"INFO: First {len(error_samples)} error tracebacks saved to {error_log_path}")
         print("INFO: Building metadata...")
         file_class_map = {}
         for f in sorted(output_dir.glob("sample_*.pt")):