Upload folder using huggingface_hub
- all_files_temp.txt +0 -0
- archive_cache.sh +68 -0
- data/ohlc_stats.npz +1 -1
- file_list_part_002 +0 -0
- file_list_part_003 +0 -0
- file_list_part_004 +0 -0
- file_list_part_005 +0 -0
- file_list_part_006 +0 -0
- file_list_part_007 +0 -0
- file_list_part_008 +0 -0
- file_list_part_009 +0 -0
- file_list_part_010 +0 -0
- log.log +2 -2
- sample_1219wsn6eYBuK7B5_0.json +0 -0
- scripts/cache_dataset.py +0 -1
- scripts/rebuild_metadata.py +51 -0
all_files_temp.txt
ADDED
(diff too large to render)
archive_cache.sh
ADDED

CACHE_DIR="data/cache"
OUTPUT_DIR="data/archives"
BATCH_SIZE=60000  # smaller batches give more frequent progress updates

mkdir -p "$OUTPUT_DIR"

echo "========================================================"
echo "Archiving '$CACHE_DIR' into multiple zip files..."
echo "Batch Size: $BATCH_SIZE files per archive"
echo "========================================================"

echo "Scanning for .pt files..."
find "$CACHE_DIR" -maxdepth 1 -name "sample_*.pt" > all_files_temp.txt

TOTAL_FILES=$(wc -l < all_files_temp.txt)
echo "Found $TOTAL_FILES .pt files."

if [ "$TOTAL_FILES" -eq 0 ]; then
    echo "No files found to archive."
    rm all_files_temp.txt
    exit 0
fi

# Split the file list into temporary chunk files
split -l "$BATCH_SIZE" -d -a 3 all_files_temp.txt file_list_part_

echo "Starting sequential archiving..."

for LIST_FILE in file_list_part_*; do
    PART_NUM=${LIST_FILE##*_}
    ZIP_NAME="$OUTPUT_DIR/cache_batch_$PART_NUM.zip"

    echo "[$(date +%T)] Starting batch $PART_NUM ($BATCH_SIZE files) -> $ZIP_NAME"

    # Process each batch sequentially. zip flags:
    #   -1  fastest compression
    #   -m  move files into the archive (delete originals after a
    #       successful zip), freeing disk space as each batch completes
    #   -q  quiet output
    #   -j  junk paths (store files flat, without directory prefixes)
    #   -@  read the list of files to archive from stdin
    zip -1 -mq -j "$ZIP_NAME" -@ < "$LIST_FILE"

    # Verify the zip was created
    if [ -f "$ZIP_NAME" ]; then
        SIZE=$(du -h "$ZIP_NAME" | cut -f1)
        echo "[$(date +%T)] Finished batch $PART_NUM (Size: $SIZE)"
    else
        echo "ERROR: Failed to create $ZIP_NAME"
        exit 1
    fi

    rm "$LIST_FILE"
done

# Cleanup
rm all_files_temp.txt

echo "========================================================"
echo "Done! Archives are in $OUTPUT_DIR"
echo "========================================================"
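The batch archives can be verified and unpacked later. As a minimal sketch (not part of this commit; the paths mirror the script's defaults above), a restore helper might look like:

import zipfile
from pathlib import Path

ARCHIVE_DIR = Path("data/archives")   # matches OUTPUT_DIR above
CACHE_DIR = Path("data/cache")        # matches CACHE_DIR above
CACHE_DIR.mkdir(parents=True, exist_ok=True)

for zip_path in sorted(ARCHIVE_DIR.glob("cache_batch_*.zip")):
    with zipfile.ZipFile(zip_path) as zf:
        # testzip() returns the name of the first corrupt member, or None
        bad = zf.testzip()
        if bad is not None:
            raise RuntimeError(f"{zip_path.name}: corrupt member {bad}")
        # -j was used when zipping, so members extract flat into the cache dir
        zf.extractall(CACHE_DIR)
    print(f"Restored {zip_path.name}")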
data/ohlc_stats.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:a030a8941c957d14d1f5c9469407bcdb0d6eb52ea743e20a0049e5b135c1d6d4
 size 1660
file_list_part_002
ADDED
(diff too large to render)

file_list_part_003
ADDED
(diff too large to render)

file_list_part_004
ADDED
(diff too large to render)

file_list_part_005
ADDED
(diff too large to render)

file_list_part_006
ADDED
(diff too large to render)

file_list_part_007
ADDED
(diff too large to render)

file_list_part_008
ADDED
(diff too large to render)

file_list_part_009
ADDED
(diff too large to render)

file_list_part_010
ADDED
(diff too large to render)
log.log
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:27af4460defe3832c7bb709538efa4908bd370b202bad245473b23b14648e2db
+size 2881
sample_1219wsn6eYBuK7B5_0.json
ADDED
(diff too large to render)
scripts/cache_dataset.py
CHANGED
@@ -408,7 +408,6 @@ def main():
     json.dump({
         'file_class_map': file_class_map,
         'class_distribution': {str(k): v for k, v in class_distribution.items()},
-        'cache_mode': args.cache_mode,
         'num_workers': args.num_workers,
         'horizons_seconds': args.horizons_seconds,
         'quantiles': args.quantiles,
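Dropping 'cache_mode' from the dumped metadata presumably keeps the on-disk schema aligned with scripts/rebuild_metadata.py below, which regenerates the same JSON without a cache_mode field.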
scripts/rebuild_metadata.py
ADDED

import sys
sys.path.append(".")
import torch
import json
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict

def rebuild_metadata(cache_dir="data/cache"):
    cache_path = Path(cache_dir)
    print(f"Scanning {cache_path} for .pt files...")

    files = sorted(cache_path.glob("sample_*.pt"))
    if not files:
        print("No .pt files found!")
        return

    print(f"Found {len(files)} files. Reading class IDs...")

    file_class_map = {}
    class_distribution = defaultdict(int)

    for f in tqdm(files):
        try:
            # Only the class_id is needed, but torch.load deserializes the
            # whole sample, so load onto the CPU to keep it cheap.
            data = torch.load(f, map_location="cpu", weights_only=False)
            cid = data.get("class_id", 0)
            file_class_map[f.name] = cid
            class_distribution[cid] += 1
        except Exception as e:
            print(f"Error reading {f.name}: {e}")

    output_data = {
        'file_class_map': file_class_map,
        'class_distribution': {str(k): v for k, v in class_distribution.items()},
        # Informational defaults so loaders that check these keys keep working
        'num_workers': 1,
        'horizons_seconds': [300, 900, 1800, 3600, 7200],  # taken from pre_cache.sh
        'quantiles': [0.1, 0.5, 0.9],
    }

    out_file = cache_path / "class_metadata.json"
    with open(out_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"Successfully rebuilt metadata for {len(file_class_map)} files.")
    print(f"Saved to: {out_file}")

if __name__ == "__main__":
    rebuild_metadata()
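A hypothetical consumer of the rebuilt class_metadata.json (an assumption for illustration, not code from this repo) could weight samples inversely to class frequency when training:

import json
from pathlib import Path

import torch
from torch.utils.data import WeightedRandomSampler

meta = json.loads(Path("data/cache/class_metadata.json").read_text())
# JSON keys are strings; convert class ids back to ints
dist = {int(k): v for k, v in meta["class_distribution"].items()}

# One weight per cached file, sorted by file name for a stable order
items = sorted(meta["file_class_map"].items())
weights = torch.tensor([1.0 / dist[cid] for _, cid in items], dtype=torch.double)
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)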