English
File size: 706 Bytes
1026698
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import os

INPUT_FOLDER = "/home/ubuntu/output"  # path to your processed wiki dump folder
OUTPUT_FILE = "wiki_texts.txt"


def merge_wiki_texts(input_folder: str, output_file: str) -> int:
    """Concatenate every non-empty line found under *input_folder*.

    The folder is walked recursively. Each file is read as UTF-8, every
    line is stripped of surrounding whitespace, and non-empty results are
    written to *output_file*, one per line. *output_file* is truncated
    first.

    Directories and files are visited in sorted name order so the output
    is reproducible across runs and filesystems.

    Args:
        input_folder: Root directory to read from. A missing directory
            yields an empty output file, because ``os.walk`` ignores
            listing errors by default.
        output_file: Path of the merged text file to create.

    Returns:
        The number of lines written to *output_file*.

    Raises:
        OSError: If *output_file* or an input file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Resolved once so the output can be recognised if it lives inside
    # the tree being walked.
    output_real = os.path.realpath(output_file)
    written = 0

    with open(output_file, "w", encoding="utf-8") as out_f:
        # walk through all directories and files recursively
        for root, dirs, files in os.walk(input_folder):
            # Sorting in place fixes the order in which os.walk descends
            # into sub-directories (top-down mode honours the mutation).
            dirs.sort()
            for filename in sorted(files):
                filepath = os.path.join(root, filename)
                # Never feed the file being written back into itself.
                if os.path.realpath(filepath) == output_real:
                    continue
                with open(filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        # strip() already drops the trailing newline.
                        text = line.strip()
                        if text:  # skip empty lines
                            out_f.write(text + "\n")
                            written += 1

    return written


merge_wiki_texts(INPUT_FOLDER, OUTPUT_FILE)

print(f"✅ {OUTPUT_FILE} created with all documents from {INPUT_FOLDER}!")