import os

INPUT_FOLDER = "/home/ubuntu/output"  # path to your processed wiki dump folder
OUTPUT_FILE = "wiki_texts.txt"


def merge_wiki_texts(input_folder, output_file):
    """Concatenate every non-blank line found under *input_folder* into *output_file*.

    The folder is walked recursively. Sub-directories and the files inside
    each directory are visited in sorted name order, so repeated runs over
    the same tree produce the same output. Each line is stripped of leading
    and trailing whitespace; lines that are empty after stripping are
    dropped, and every kept line is written followed by a single newline.

    Args:
        input_folder: Directory whose files are read (as UTF-8 text).
        output_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        The number of lines written to *output_file*.

    Raises:
        NotADirectoryError: If *input_folder* does not exist or is not a
            directory. Raised before *output_file* is opened, so an
            existing output file is not truncated in that case.
    """
    if not os.path.isdir(input_folder):
        raise NotADirectoryError(
            f"input folder does not exist or is not a directory: {input_folder}"
        )

    written = 0
    with open(output_file, "w", encoding="utf-8") as out_f:
        # walk through all directories and files recursively
        for root, dirs, files in os.walk(input_folder):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for filename in sorted(files):
                filepath = os.path.join(root, filename)
                # The output may live inside the input tree; never read the
                # file that is currently being written.
                if os.path.samefile(filepath, output_file):
                    continue
                with open(filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        text = line.strip()
                        if text:  # skip empty lines
                            out_f.write(text + "\n")
                            written += 1
    return written


if __name__ == "__main__":
    merge_wiki_texts(INPUT_FOLDER, OUTPUT_FILE)
    print(f"✅ {OUTPUT_FILE} created with all documents from {INPUT_FOLDER}!")