zirobtc committed on
Commit
f53b3ee
·
1 Parent(s): 457b1fb

Upload folder using huggingface_hub

Browse files
all_files_temp.txt ADDED
The diff for this file is too large to render. See raw diff
 
archive_cache.sh ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
#
# archive_cache.sh — pack data/cache/sample_*.pt files into batched zip
# archives under data/archives, BATCH_SIZE files per archive.
#
# NOTE: zip is invoked with -m (move), so source files are DELETED after
# they have been successfully added to an archive. This frees disk space
# as the run progresses.
set -euo pipefail

CACHE_DIR="data/cache"
OUTPUT_DIR="data/archives"
BATCH_SIZE=60000  # smaller batches -> more frequent progress output
FILE_LIST="all_files_temp.txt"

mkdir -p "$OUTPUT_DIR"

echo "========================================================"
echo "Archiving '$CACHE_DIR' into multiple zip files..."
echo "Batch Size: $BATCH_SIZE files per archive"
echo "========================================================"

echo "Scanning for .pt files..."
# A missing/unreadable cache dir is treated the same as an empty one
# (find's error still goes to stderr).
find "$CACHE_DIR" -maxdepth 1 -name "sample_*.pt" > "$FILE_LIST" || true

TOTAL_FILES=$(wc -l < "$FILE_LIST")
echo "Found $TOTAL_FILES .pt files."

if [ "$TOTAL_FILES" -eq 0 ]; then
  echo "No files found to archive."
  rm -f "$FILE_LIST"
  exit 0
fi

# Remove stale chunk lists from a previous (possibly interrupted) run so the
# loop below does not re-process outdated file lists.
rm -f file_list_part_*

# Split the master list into numbered chunk files of BATCH_SIZE lines each
# (file_list_part_000, file_list_part_001, ...).
split -l "$BATCH_SIZE" -d -a 3 "$FILE_LIST" file_list_part_

echo "Starting sequential archiving..."

for LIST_FILE in file_list_part_*; do
  PART_NUM=${LIST_FILE##*_}
  ZIP_NAME="$OUTPUT_DIR/cache_batch_$PART_NUM.zip"

  echo "[$(date +%T)] Starting batch $PART_NUM ($BATCH_SIZE files) -> $ZIP_NAME"

  # -1: fastest compression; -m: move (delete) sources once archived;
  # -q: quiet; -j: junk directory paths; -@: read file names from stdin.
  if ! zip -1 -mq -j "$ZIP_NAME" -@ < "$LIST_FILE"; then
    echo "ERROR: zip failed for batch $PART_NUM ($ZIP_NAME)" >&2
    exit 1
  fi

  # Belt-and-braces: verify the archive actually exists on disk.
  if [ -f "$ZIP_NAME" ]; then
    SIZE=$(du -h "$ZIP_NAME" | cut -f1)
    echo "[$(date +%T)] Finished batch $PART_NUM (Size: $SIZE)"
  else
    echo "ERROR: Failed to create $ZIP_NAME" >&2
    exit 1
  fi

  rm -f "$LIST_FILE"
done

# Cleanup
rm -f "$FILE_LIST"

echo "========================================================"
echo "Done! Archives are in $OUTPUT_DIR"
echo "========================================================"
data/ohlc_stats.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2109769ab98455e3a8443ca8fc45a0d6f6b01f8724b45850e4a048c55a1e129e
3
  size 1660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a030a8941c957d14d1f5c9469407bcdb0d6eb52ea743e20a0049e5b135c1d6d4
3
  size 1660
file_list_part_002 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_003 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_004 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_005 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_006 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_007 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_008 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_009 ADDED
The diff for this file is too large to render. See raw diff
 
file_list_part_010 ADDED
The diff for this file is too large to render. See raw diff
 
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd85070eb1d168bd453ed03a50319d4302c9f34aa2e1ee2e7b47a0659f7f68d9
3
- size 18836
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27af4460defe3832c7bb709538efa4908bd370b202bad245473b23b14648e2db
3
+ size 2881
sample_1219wsn6eYBuK7B5_0.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/cache_dataset.py CHANGED
@@ -408,7 +408,6 @@ def main():
408
  json.dump({
409
  'file_class_map': file_class_map,
410
  'class_distribution': {str(k): v for k, v in class_distribution.items()},
411
- 'cache_mode': args.cache_mode,
412
  'num_workers': args.num_workers,
413
  'horizons_seconds': args.horizons_seconds,
414
  'quantiles': args.quantiles,
 
408
  json.dump({
409
  'file_class_map': file_class_map,
410
  'class_distribution': {str(k): v for k, v in class_distribution.items()},
 
411
  'num_workers': args.num_workers,
412
  'horizons_seconds': args.horizons_seconds,
413
  'quantiles': args.quantiles,
scripts/rebuild_metadata.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ sys.path.append(".")
3
+ import torch
4
+ import json
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ from collections import defaultdict
8
+
9
def rebuild_metadata(cache_dir="data/cache"):
    """Rebuild class_metadata.json by scanning cached sample_*.pt files.

    Loads each ``sample_*.pt`` file in ``cache_dir`` (CPU only) to read its
    ``class_id``, then writes the file->class map and the per-class counts
    to ``<cache_dir>/class_metadata.json``.

    Args:
        cache_dir: Directory containing the cached ``.pt`` sample files.
    """
    # tqdm is only a progress bar; fall back to plain iteration when the
    # package is not installed so the rebuild still works.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    cache_path = Path(cache_dir)
    print(f"Scanning {cache_path} for .pt files...")

    # sorted() already returns a list; no need for an extra list() copy.
    files = sorted(cache_path.glob("sample_*.pt"))
    if not files:
        print("No .pt files found!")
        return

    print(f"Found {len(files)} files. Reading class IDs...")

    file_class_map = {}
    class_distribution = defaultdict(int)

    for sample_file in tqdm(files):
        try:
            # torch.load reads the whole file; map_location="cpu" keeps
            # tensors off the GPU. NOTE(review): weights_only=False
            # unpickles arbitrary objects -- only safe because these cache
            # files are produced locally by our own pipeline.
            data = torch.load(sample_file, map_location="cpu",
                              weights_only=False)
            # assumes each .pt file holds a dict; missing class_id -> 0
            cid = data.get("class_id", 0)
            file_class_map[sample_file.name] = cid
            class_distribution[cid] += 1
        except Exception as e:
            # Best-effort: skip unreadable samples but keep going.
            print(f"Error reading {sample_file.name}: {e}")

    output_data = {
        'file_class_map': file_class_map,
        'class_distribution': {str(k): v for k, v in class_distribution.items()},
        # Informational defaults so loaders that check these keys don't break.
        'num_workers': 1,
        'horizons_seconds': [300, 900, 1800, 3600, 7200],  # from pre_cache.sh
        'quantiles': [0.1, 0.5, 0.9],
    }

    out_file = cache_path / "class_metadata.json"
    # Use a distinct handle name so we don't shadow the loop variable style
    # used above.
    with open(out_file, 'w') as fh:
        json.dump(output_data, fh, indent=2)

    print(f"Successfully rebuilt metadata for {len(file_class_map)} files.")
    print(f"Saved to: {out_file}")
49
+
50
if __name__ == "__main__":
    # Optional positional CLI argument: cache directory to scan
    # (defaults to data/cache, matching rebuild_metadata's default).
    rebuild_metadata(sys.argv[1] if len(sys.argv) > 1 else "data/cache")