Spaces:

ibrahimmkhalid
/

llm-from-scratch

Sleeping

App Files Files Community

llm-from-scratch / extract.py

ibrahimmkhalid

update extract script

36bf517 almost 2 years ago

raw

history blame

2.25 kB

	import lzma
	import os
	import subprocess

	from tqdm import tqdm

	def xz_files_in_dir(directory):
	    """Return the sorted names of the regular ``.xz`` files inside *directory*.

	    Only entries directly inside *directory* are considered, and the bare
	    file names are returned (they are not joined with *directory*).
	    Entries that end in ``.xz`` but are not regular files are skipped.

	    ``os.listdir`` yields entries in arbitrary order, so the names are
	    sorted to give callers that slice the list the same result on every run.
	    """
	    return sorted(
	        filename
	        for filename in os.listdir(directory)
	        if filename.endswith(".xz")
	        and os.path.isfile(os.path.join(directory, filename))
	    )

	# Input archive, the folder the script reads the .xz files from, and the
	# three files it produces.
	tarxz_path = "./openwebtext.tar.xz"
	folder_path = "./openwebtext"
	output_file_train = "./openwebtext/train_split.txt"
	output_file_val = "./openwebtext/val_split.txt"
	vocab_file = "./openwebtext/vocab.txt"

	# Without the archive there is nothing to do: point the user at the
	# download page and stop.
	if not os.path.exists(tarxz_path):
	    print("Please download the openwebtext.tar.xz file from:")
	    print("https://skylion007.github.io/OpenWebTextCorpus/")
	    # ``exit()`` is a convenience added by the ``site`` module for
	    # interactive sessions and is not guaranteed to be defined. Raise
	    # SystemExit directly, with a non-zero status so that a calling shell
	    # or script can tell the run did not succeed.
	    raise SystemExit(1)

	# Extract the tar.xz file (skipped when the target folder already exists)
	if not os.path.exists(folder_path):
	    os.mkdir(folder_path)
	    try:
	        # Arguments are passed as a list, so no shell is involved, and
	        # check=True turns a failing tar into an exception instead of
	        # letting the script carry on with nothing extracted.
	        subprocess.run(["tar", "-xvf", tarxz_path], check=True)
	    except (OSError, subprocess.CalledProcessError):
	        # If nothing was extracted, remove the folder created above;
	        # otherwise the next run would see it and skip extraction.
	        if not os.listdir(folder_path):
	            os.rmdir(folder_path)
	        raise

	# Gather the .xz file names and cut the list in two: the first 90%
	# (rounded down) is used for training, the remainder for validation.
	files = xz_files_in_dir(folder_path)
	total_files = len(files)
	split_index = int(total_files * 0.9)
	files_train, files_val = files[:split_index], files[split_index:]

	# Process the files for training and validation separately
	vocab = set()


	def _build_split(filenames, source_dir, output_path):
	    """Decompress *filenames* from *source_dir* into one UTF-8 text file.

	    The text is written to ``output_path + ".tmp"`` and renamed to
	    *output_path* only after every file has been processed, so an
	    interrupted run does not leave a truncated file that a later run would
	    take for a finished split.

	    Returns the set of characters that were written.
	    """
	    seen = set()
	    partial_path = output_path + ".tmp"
	    with open(partial_path, "w", encoding="utf-8") as outfile:
	        for filename in tqdm(filenames, total=len(filenames)):
	            file_path = os.path.join(source_dir, filename)
	            with lzma.open(file_path, "rt", encoding="utf-8") as infile:
	                text = infile.read()
	            outfile.write(text)
	            seen.update(text)
	    os.replace(partial_path, output_path)
	    return seen


	def _scan_characters(path, chunk_size=1 << 20):
	    """Return the set of characters found in the existing text file *path*."""
	    seen = set()
	    with open(path, "r", encoding="utf-8") as infile:
	        # Read in fixed-size chunks so the whole file is never held in
	        # memory at once; read() returns "" at end of file.
	        for chunk in iter(lambda: infile.read(chunk_size), ""):
	            seen.update(chunk)
	    return seen


	# The vocabulary has to cover both splits. A split that already exists on
	# disk is not regenerated, so when vocab.txt is still missing its
	# characters are collected by scanning the existing file instead.
	need_vocab = not os.path.exists(vocab_file)

	for split_files, split_path in (
	    (files_train, output_file_train),
	    (files_val, output_file_val),
	):
	    if not os.path.exists(split_path):
	        vocab.update(_build_split(split_files, folder_path, split_path))
	    elif need_vocab:
	        vocab.update(_scan_characters(split_path))

	# Write the vocabulary to vocab.txt, one character per line, in sorted
	# order so the file is identical from run to run.
	if need_vocab:
	    with open(vocab_file, "w", encoding="utf-8") as vfile:
	        for char in sorted(vocab):
	            vfile.write(char + '\n')