File size: 706 Bytes
import os
INPUT_FOLDER = "/home/ubuntu/output"  # path to your processed wiki dump folder
OUTPUT_FILE = "wiki_texts.txt"


def merge_wiki_texts(input_folder: str, output_file: str) -> int:
    """Merge every non-blank line found under *input_folder* into *output_file*.

    The folder is walked recursively. Sub-directories and the files inside
    each directory are visited in sorted name order, so the output is
    reproducible across runs and filesystems. Each line is stripped of
    leading/trailing whitespace; lines that are empty after stripping are
    dropped. Every kept line is written followed by a single newline.

    Args:
        input_folder: Root directory to walk. A missing directory yields no
            files (``os.walk`` ignores the error) and produces an empty
            output file.
        output_file: Path of the text file to create or overwrite (UTF-8).

    Returns:
        The number of lines written to *output_file*.

    Raises:
        OSError: If the output file or one of the input files cannot be
            opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Resolved once so the output file can be recognised if it happens to
    # live inside the tree being walked.
    output_real = os.path.realpath(output_file)
    written = 0

    with open(output_file, "w", encoding="utf-8") as out_f:
        # walk through all directories and files recursively
        for root, dirs, files in os.walk(input_folder):
            # Sorting in place steers os.walk's (top-down) descent order.
            dirs.sort()
            for filename in sorted(files):
                filepath = os.path.join(root, filename)
                # Never read the file we are currently writing: it already
                # exists by now and would feed its own output back in.
                if os.path.realpath(filepath) == output_real:
                    continue
                with open(filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        # strip() already removes the trailing newline.
                        text = line.strip()
                        if text:  # skip empty lines
                            out_f.write(text + "\n")
                            written += 1

    return written


merge_wiki_texts(INPUT_FOLDER, OUTPUT_FILE)
print(f"✅ {OUTPUT_FILE} created with all documents from {INPUT_FOLDER}!")
|