Spaces:
Sleeping
Sleeping
Commit
·
36bf517
1
Parent(s):
2ef951d
update extract script
Browse files
- extract.py +5 -0
extract.py
CHANGED
|
@@ -15,6 +15,11 @@ output_file_train = "./openwebtext/train_split.txt"
|
|
| 15 |
output_file_val = "./openwebtext/val_split.txt"
|
| 16 |
vocab_file = "./openwebtext/vocab.txt"
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Extract the tar.xz file
|
| 19 |
if not os.path.exists(folder_path):
|
| 20 |
os.mkdir(folder_path)
|
|
|
|
| 15 |
output_file_val = "./openwebtext/val_split.txt"
|
| 16 |
vocab_file = "./openwebtext/vocab.txt"
|
| 17 |
|
| 18 |
+
if not os.path.exists(tarxz_path):
|
| 19 |
+
print("Please download the openwebtext.tar.xz file from:")
|
| 20 |
+
print("https://skylion007.github.io/OpenWebTextCorpus/")
|
| 21 |
+
exit()
|
| 22 |
+
|
| 23 |
# Extract the tar.xz file
|
| 24 |
if not os.path.exists(folder_path):
|
| 25 |
os.mkdir(folder_path)
|