import os

INPUT_FOLDER = "/home/ubuntu/output"  # path to your processed wiki dump folder
OUTPUT_FILE = "wiki_texts.txt"


def merge_wiki_texts(input_folder: str = INPUT_FOLDER,
                     output_file: str = OUTPUT_FILE) -> int:
    """Concatenate every non-empty line found under *input_folder*.

    The folder is walked recursively. Each file is read as UTF-8, every line
    is stripped of surrounding whitespace, and non-empty results are written
    to *output_file*, one per line. Directories and files are both visited in
    sorted order so repeated runs over the same tree give the same output.

    Args:
        input_folder: Root directory to walk.
        output_file: Path of the merged text file; overwritten if it exists.

    Returns:
        The number of lines written to *output_file*.

    Raises:
        NotADirectoryError: If *input_folder* is not an existing directory.
            Checked before the output file is opened, so an existing output
            is not truncated by a mistyped input path.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    if not os.path.isdir(input_folder):
        raise NotADirectoryError(f"input folder not found: {input_folder}")

    # Resolved once so the output file can be recognised if it lives inside
    # the tree being walked.
    output_real = os.path.realpath(output_file)
    written = 0

    with open(output_file, "w", encoding="utf-8") as out_f:
        # walk through all directories and files recursively
        for root, dirs, files in os.walk(input_folder):
            # In-place sort: os.walk (top-down) descends in the order of
            # this list, so this makes the directory order deterministic.
            dirs.sort()
            for filename in sorted(files):
                filepath = os.path.join(root, filename)
                if os.path.realpath(filepath) == output_real:
                    # Never feed the file being written back into itself.
                    continue
                with open(filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        # strip() already removes the trailing newline.
                        text = line.strip()
                        if text:  # skip empty lines
                            out_f.write(text + "\n")
                            written += 1
    return written


def main() -> None:
    """Merge INPUT_FOLDER into OUTPUT_FILE and report completion."""
    merge_wiki_texts(INPUT_FOLDER, OUTPUT_FILE)
    print(f"✅ {OUTPUT_FILE} created with all documents from {INPUT_FOLDER}!")


if __name__ == "__main__":
    main()