Spaces:
Sleeping
Sleeping
| import os | |
| import lzma | |
| from tqdm import tqdm | |
def xz_files_in_dir(directory):
    """Return the names of the regular ``.xz`` files directly inside *directory*.

    Only bare file names are returned (they are not joined with *directory*).
    Entries that are not regular files are skipped, even if their name ends
    in ``.xz``.

    The result is sorted: ``os.listdir`` yields entries in arbitrary order,
    and a caller that slices this list (for example into a train/validation
    split) needs the same order on every run and on every filesystem.
    """
    return sorted(
        filename
        for filename in os.listdir(directory)
        if filename.endswith(".xz")
        and os.path.isfile(os.path.join(directory, filename))
    )
tarxz_path = "./openwebtext.tar.xz"
folder_path = "./openwebtext"
output_file_train = "./openwebtext/train_split.txt"
output_file_val = "./openwebtext/val_split.txt"
vocab_file = "./vocab.txt"


def _write_split(filenames, output_path, vocab):
    """Decompress *filenames* (relative to ``folder_path``) into *output_path*.

    The decoded text of every file is appended to the output in order, and
    every character seen is added to the *vocab* set.

    The text is first written to ``output_path + ".partial"`` and only moved
    into place once all files were processed. The script uses the existence
    of *output_path* as its "already done" marker, so an interrupted run must
    not leave a truncated file under the final name.
    """
    partial_path = output_path + ".partial"
    with open(partial_path, "w", encoding="utf-8") as outfile:
        for filename in tqdm(filenames, total=len(filenames)):
            file_path = os.path.join(folder_path, filename)
            with lzma.open(file_path, "rt", encoding="utf-8") as infile:
                text = infile.read()
            outfile.write(text)
            vocab.update(text)
    os.replace(partial_path, output_path)


def _scan_vocab(path, vocab, chunk_size=1 << 20):
    """Add every character of the existing text file *path* to *vocab*.

    The file is read in chunks of *chunk_size* characters so it never has to
    fit in memory as a whole.
    """
    with open(path, "r", encoding="utf-8") as infile:
        for chunk in iter(lambda: infile.read(chunk_size), ""):
            vocab.update(chunk)


if not os.path.exists(tarxz_path):
    print("Please download the openwebtext.tar.xz file from:")
    print("https://skylion007.github.io/OpenWebTextCorpus/")
    # Non-zero status so calling scripts can tell the run did not succeed.
    raise SystemExit(1)

# Extract the tar.xz file
if not os.path.exists(folder_path):
    os.mkdir(folder_path)
    status = os.system(f"tar -xvf {tarxz_path}")
    if status != 0:
        # folder_path now exists, so a plain re-run would skip extraction.
        raise SystemExit(
            f"Extracting {tarxz_path} failed (status {status}); "
            f"delete {folder_path} before retrying."
        )

# Sorted so the train/validation split is the same on every run.
files = sorted(xz_files_in_dir(folder_path))
total_files = len(files)
if total_files == 0:
    raise SystemExit(f"No .xz files found in {folder_path}; nothing to process.")

# Calculate the split indices
split_index = int(total_files * 0.9)  # 90% for training
files_train = files[:split_index]
files_val = files[split_index:]

# Process the files for training and validation separately
vocab = set()
vocab_needed = not os.path.exists(vocab_file)

for split_files, split_output in (
    (files_train, output_file_train),
    (files_val, output_file_val),
):
    if not os.path.exists(split_output):
        _write_split(split_files, split_output, vocab)
    elif vocab_needed:
        # The split was produced by an earlier run; its characters still have
        # to be counted or the vocabulary written below would be incomplete.
        _scan_vocab(split_output, vocab)

# Write the vocabulary to vocab.txt (one character per line, sorted so the
# file content does not depend on set iteration order)
if vocab_needed:
    with open(vocab_file, "w", encoding="utf-8") as vfile:
        for char in sorted(vocab):
            vfile.write(char + "\n")