import os
from typing import Optional, Sequence, Tuple

# Pre-defined start lines (1-indexed) paired with the article title that
# begins on that line, listed in the order the articles appear in the input.
_DEFAULT_BOUNDARIES = (
    (1, "United Kingdom"),
    (108, "Physics"),
    (232, "Chemistry"),
    (337, "United States"),
    (497, "Science"),
    (693, "Isaac Newton"),
    (729, "Leonardo da Vinci"),
    (813, "Evolution"),
    (1045, "Albert Einstein"),
    (1156, "India"),
)


def split_wiki(
    input_file: str = "wikitop10.txt",
    output_dir: str = "tests/test_data/wiki_top10",
    boundaries: Optional[Sequence[Tuple[int, str]]] = None,
) -> None:
    """Split a concatenated article dump into one text file per article.

    Each article spans from its own start line up to (but excluding) the
    start line of the next article; the last article runs to the end of the
    input. The slice is stripped of leading/trailing whitespace and written
    to ``<output_dir>/<title>.txt``, where the title is lower-cased and has
    spaces replaced by underscores. One "Created ..." line is printed per
    file written.

    Args:
        input_file: Path of the text file to split.
        output_dir: Directory receiving the per-article files; created
            (including parents) if it does not exist.
        boundaries: ``(start_line, title)`` pairs with 1-indexed, strictly
            increasing start lines. ``None`` selects the built-in table.

    Raises:
        ValueError: If a start line is below 1 or the start lines are not
            strictly increasing.
        OSError: If the input cannot be read or an output cannot be written.
    """
    if boundaries is None:
        boundaries = _DEFAULT_BOUNDARIES
    boundaries = list(boundaries)

    # Reject tables that would yield negative or overlapping slices instead
    # of silently writing wrong content.
    previous_start = 0
    for start_line, title in boundaries:
        if start_line < 1:
            raise ValueError(
                f"start line for {title!r} must be >= 1, got {start_line}"
            )
        if start_line <= previous_start:
            raise ValueError(
                f"start line for {title!r} ({start_line}) must be greater "
                f"than the previous start line ({previous_start})"
            )
        previous_start = start_line

    os.makedirs(output_dir, exist_ok=True)

    # errors='ignore' drops undecodable bytes rather than failing, so a
    # slightly corrupt dump can still be split.
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    # Convert to 0-indexed slice bounds: every article ends where the next
    # one starts, and the final article ends at EOF.
    start_indices = [start_line - 1 for start_line, _ in boundaries]
    end_indices = start_indices[1:] + [len(lines)]

    for (_, raw_title), start_idx, end_idx in zip(
        boundaries, start_indices, end_indices
    ):
        title = raw_title.replace(" ", "_").lower()
        article_content = "".join(lines[start_idx:end_idx]).strip()

        filename = f"{title}.txt"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as out:
            out.write(article_content)

        print(f"Created {filename} ({len(article_content)} chars)")


if __name__ == "__main__":
    split_wiki()