File size: 1,410 Bytes
a8639ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
import os
import tqdm
# get through all files in inputted path, put the first 80% in one file concatenated and the last 20% in another
def split_files(input_path, output_path1, output_path2):
files = [
f
for f in os.listdir(input_path)
if os.path.isfile(os.path.join(input_path, f)) and f.endswith(".py")
]
# Sort files to maintain order
files.sort()
split_index = int(len(files) * 0.8)
first_80_files = files[:split_index]
last_20_files = files[split_index:]
print(os.listdir(input_path))
with open(output_path1, "w") as outfile1:
for fname in tqdm.tqdm(first_80_files):
with open(os.path.join(input_path, fname), errors="ignore") as infile:
outfile1.write(infile.read())
outfile1.write("\nprint('---FILESEP---')\n")
with open(output_path2, "w") as outfile2:
for fname in tqdm.tqdm(last_20_files):
with open(os.path.join(input_path, fname), errors="ignore") as infile:
outfile2.write(infile.read())
outfile2.write("\nprint('---FILESEP---')\n")
# Example usage
input_path = os.path.expanduser("~/torch_datasets/github-python/all_trains")
output_path1 = os.path.expanduser("~/torch_datasets/github-python/80")
output_path2 = os.path.expanduser("~/torch_datasets/github-python/20")
split_files(input_path, output_path1, output_path2)
|