ikaganacar committed on
Commit
4c8c009
·
1 Parent(s): ff6a18f
Files changed (1) hide show
  1. Model_Architecture/data/get_data.py +24 -15
Model_Architecture/data/get_data.py CHANGED
@@ -27,6 +27,7 @@ def download_and_prepare_data(
27
  train_ratio: float = 0.90,
28
  seed: int = 2357,
29
  max_samples: int = None,
 
30
  ):
31
 
32
  data_dir = Path(data_dir)
@@ -45,25 +46,27 @@ def download_and_prepare_data(
45
  else:
46
  # Try to find cached parquet files first
47
  import os
48
- cache_dir = Path.home() / ".cache/huggingface/datasets/downloads"
49
 
50
  print(f" Looking for cached parquet files...")
51
  parquet_files = []
52
 
53
- if cache_dir.exists():
54
- # Find all parquet files in cache
55
- for root, dirs, files in os.walk(cache_dir):
56
- for file in files:
57
- if file.endswith('.parquet') and 'tr_part' in file:
58
- parquet_files.append(os.path.join(root, file))
59
-
60
- # Also check for extracted parquet files in the main cache directory
61
- culturax_cache = Path.home() / ".cache/huggingface/datasets/uonlp___cultura_x"
62
- if culturax_cache.exists():
63
- for root, dirs, files in os.walk(culturax_cache):
64
- for file in files:
65
- if file.endswith('.parquet'):
66
- parquet_files.append(os.path.join(root, file))
 
 
 
67
 
68
  if parquet_files:
69
  # Use the cached parquet files
@@ -241,6 +244,11 @@ def main():
241
  type=int,
242
  help="Maximum number of samples to process (for testing)"
243
  )
 
 
 
 
 
244
 
245
  args = parser.parse_args()
246
 
@@ -262,6 +270,7 @@ def main():
262
  train_ratio=train_ratio,
263
  seed=args.seed,
264
  max_samples=args.max_samples,
 
265
  )
266
 
267
 
 
27
  train_ratio: float = 0.90,
28
  seed: int = 2357,
29
  max_samples: int = None,
30
+ cache_dir: str = None,
31
  ):
32
 
33
  data_dir = Path(data_dir)
 
46
  else:
47
  # Try to find cached parquet files first
48
  import os
 
49
 
50
  print(f" Looking for cached parquet files...")
51
  parquet_files = []
52
 
53
+ # Use custom cache directory if provided, otherwise use default
54
+ if cache_dir:
55
+ search_paths = [Path(cache_dir)]
56
+ else:
57
+ search_paths = [
58
+ Path.home() / ".cache/huggingface/datasets/downloads",
59
+ Path.home() / ".cache/huggingface/datasets/uonlp___cultura_x",
60
+ ]
61
+
62
+ for search_path in search_paths:
63
+ if search_path.exists():
64
+ print(f" Searching in: {search_path}")
65
+ # Find all parquet files in cache
66
+ for root, dirs, files in os.walk(search_path):
67
+ for file in files:
68
+ if file.endswith('.parquet'):
69
+ parquet_files.append(os.path.join(root, file))
70
 
71
  if parquet_files:
72
  # Use the cached parquet files
 
244
  type=int,
245
  help="Maximum number of samples to process (for testing)"
246
  )
247
+ parser.add_argument(
248
+ "--cache_dir",
249
+ type=str,
250
+ help="Custom cache directory path where parquet files are located"
251
+ )
252
 
253
  args = parser.parse_args()
254
 
 
270
  train_ratio=train_ratio,
271
  seed=args.seed,
272
  max_samples=args.max_samples,
273
+ cache_dir=args.cache_dir,
274
  )
275
 
276