Charlie81 committed on
Commit
3db4e2e
·
1 Parent(s): dd2e997

claude attempt 2 dataset

Browse files
Files changed (1) hide show
  1. scripts/train.py +55 -19
scripts/train.py CHANGED
@@ -11,31 +11,67 @@ from datasets import load_dataset
11
  from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
12
  import os
13
 
 
 
14
  import datasets
 
15
 
16
- # Clear the cache
17
- datasets.disable_caching()
 
18
 
19
- # Or manually clear cache
20
- import shutil
21
- import os
22
- cache_dir = os.path.expanduser("~/.cache/huggingface/datasets")
23
- if os.path.exists(cache_dir):
24
- shutil.rmtree(cache_dir)
25
 
26
- #
27
- import datasets
28
- import os
29
 
30
- # Check current cache directory
31
- print("Current cache directory:", datasets.config.HF_DATASETS_CACHE)
32
- print("Cache exists:", os.path.exists(datasets.config.HF_DATASETS_CACHE))
33
 
34
- # List contents if it exists
35
- if os.path.exists(datasets.config.HF_DATASETS_CACHE):
36
- print("Cache contents:")
37
- for item in os.listdir(datasets.config.HF_DATASETS_CACHE):
38
- print(f" {item}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  #
40
  def main():
41
  print("Starting Training Script for my cool OLMoE")
 
11
  from myolmoe import MyOlmoeForCausalLM, OlmoeConfig
12
  import os
13
 
14
+ import os
15
+ import tempfile
16
  import datasets
17
+ from datasets import load_dataset
18
 
19
+ # Create a completely isolated cache directory
20
+ temp_dir = tempfile.mkdtemp()
21
+ print(f"Using temporary cache directory: {temp_dir}")
22
 
23
+ # Set environment variable to override cache location
24
+ os.environ['HF_DATASETS_CACHE'] = temp_dir
 
 
 
 
25
 
26
+ # Also set HF_HOME to ensure clean state
27
+ os.environ['HF_HOME'] = temp_dir
 
28
 
29
+ # Disable caching entirely
30
+ datasets.disable_caching()
 
31
 
32
+ # Now try loading
33
+ try:
34
+ dataset = load_dataset(
35
+ "allenai/tulu-v2-sft-mixture",
36
+ split="train",
37
+ cache_dir=temp_dir,
38
+ download_mode="force_redownload"
39
+ )
40
+ print(f"SUCCESS! Loaded {len(dataset)} examples")
41
+ except Exception as e:
42
+ print(f"Still failing: {e}")
43
+ print("Let's try streaming approach...")
44
+
45
+ # Try streaming instead
46
+ try:
47
+ dataset = load_dataset(
48
+ "allenai/tulu-v2-sft-mixture",
49
+ split="train",
50
+ streaming=True
51
+ )
52
+ print("Streaming dataset loaded successfully!")
53
+
54
+ # Convert streaming to regular dataset (sample first 1000 for testing)
55
+ dataset_list = []
56
+ for i, example in enumerate(dataset):
57
+ dataset_list.append(example)
58
+ if i >= 1000: # Just for testing
59
+ break
60
+
61
+ from datasets import Dataset
62
+ dataset = Dataset.from_list(dataset_list)
63
+ print(f"Converted to regular dataset with {len(dataset)} examples")
64
+
65
+ except Exception as e2:
66
+ print(f"Streaming also failed: {e2}")
67
+ print("Let's try a different dataset temporarily...")
68
+
69
+ # Try a smaller, simpler dataset first
70
+ try:
71
+ test_dataset = load_dataset("imdb", split="train[:100]")
72
+ print("Simple dataset works - issue is specific to tulu-v2-sft-mixture")
73
+ except Exception as e3:
74
+ print(f"Even simple dataset fails: {e3}")
75
  #
76
  def main():
77
  print("Starting Training Script for my cool OLMoE")