Rachel Ding committed on
Commit
25c33f6
·
1 Parent(s): 349e3bf

Speed up: cache repo file list; Dasheng dropdown exclude fold*

Browse files
Files changed (1) hide show
  1. dataset_loader.py +17 -5
dataset_loader.py CHANGED
@@ -16,10 +16,21 @@ REPO_TYPE = "dataset"
16
  ROOT_PREFIX = "batch_outputs/"
17
  DASHENG_PREFIX = "batch_outputs_dasheng/"
18
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def _get_sample_ids(prefix: str = ROOT_PREFIX) -> list[str]:
21
  """List sample IDs (e.g. 07_003277) under given prefix in repo."""
22
- files = list_repo_files(REPO_ID, repo_type=REPO_TYPE)
23
  seen = set()
24
  pat = re.escape(prefix.rstrip("/")) + r"/([^/]+)/"
25
  for f in files:
@@ -62,13 +73,14 @@ def list_samples() -> list[str]:
62
 
63
 
64
  def list_samples_dasheng() -> list[str]:
65
- """Return only sample IDs that exist in batch_outputs_dasheng (no fold* from UrbanSound8k)."""
66
- return _get_sample_ids(DASHENG_PREFIX)
 
67
 
68
 
69
  def _find_files(inner: str) -> list[str]:
70
- """List all repo files under inner path."""
71
- files = list_repo_files(REPO_ID, repo_type=REPO_TYPE)
72
  return [f for f in files if f.startswith(inner + "/")]
73
 
74
 
 
16
  ROOT_PREFIX = "batch_outputs/"
17
  DASHENG_PREFIX = "batch_outputs_dasheng/"
18
 
19
+ # Cache full repo file list so we only call list_repo_files once per process (major speedup)
20
+ _cached_repo_files: Optional[list[str]] = None
21
+
22
+
23
+ def _get_repo_files() -> list[str]:
24
+ """Return full list of repo file paths, cached after first call."""
25
+ global _cached_repo_files
26
+ if _cached_repo_files is None:
27
+ _cached_repo_files = list_repo_files(REPO_ID, repo_type=REPO_TYPE)
28
+ return _cached_repo_files
29
+
30
 
31
  def _get_sample_ids(prefix: str = ROOT_PREFIX) -> list[str]:
32
  """List sample IDs (e.g. 07_003277) under given prefix in repo."""
33
+ files = _get_repo_files()
34
  seen = set()
35
  pat = re.escape(prefix.rstrip("/")) + r"/([^/]+)/"
36
  for f in files:
 
73
 
74
 
75
  def list_samples_dasheng() -> list[str]:
76
+ """Return only sample IDs for Dasheng view: from batch_outputs_dasheng, excluding fold* (UrbanSound8k)."""
77
+ ids = _get_sample_ids(DASHENG_PREFIX)
78
+ return sorted([x for x in ids if not x.startswith("fold")])
79
 
80
 
81
  def _find_files(inner: str) -> list[str]:
82
+ """List all repo files under inner path (uses cached repo file list)."""
83
+ files = _get_repo_files()
84
  return [f for f in files if f.startswith(inner + "/")]
85
 
86