Spaces:
Sleeping
Sleeping
| import os | |
# Default article layout of "wikitop10.txt": (1-indexed start line, title).
_DEFAULT_BOUNDARIES = (
    (1, "United Kingdom"),
    (108, "Physics"),
    (232, "Chemistry"),
    (337, "United States"),
    (497, "Science"),
    (693, "Isaac Newton"),
    (729, "Leonardo da Vinci"),
    (813, "Evolution"),
    (1045, "Albert Einstein"),
    (1156, "India"),
)


def split_wiki(
    input_file="wikitop10.txt",
    output_dir="tests/test_data/wiki_top10",
    boundaries=None,
):
    """Split a concatenated text file into one file per article.

    Each article spans from its own start line up to (but not including)
    the start line of the next article; the last article runs to the end
    of the input. Every article is stripped of leading/trailing whitespace
    and written to ``<output_dir>/<title>.txt``, where the title is
    lower-cased and its spaces are replaced with underscores. One
    "Created ..." line is printed per file written.

    Args:
        input_file: Path of the concatenated source text.
        output_dir: Directory that receives the per-article files; it is
            created if it does not exist.
        boundaries: Iterable of ``(start_line, title)`` pairs with
            1-indexed start lines, in file order. Defaults to the
            built-in table for the top-10 wiki dump.

    Raises:
        ValueError: If a start line is smaller than 1.
        OSError: If the input cannot be read or an output cannot be written.
    """
    if boundaries is None:
        boundaries = _DEFAULT_BOUNDARIES
    boundaries = list(boundaries)

    # A start line below 1 would turn into a negative index and silently
    # slice from the end of the file, so reject it up front.
    for start_line, title in boundaries:
        if start_line < 1:
            raise ValueError(
                f"start line for {title!r} must be >= 1, got {start_line}"
            )

    os.makedirs(output_dir, exist_ok=True)

    # Undecodable bytes are dropped on purpose (best-effort read of a dump).
    with open(input_file, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    # Pair every article with the start line of its successor. The sentinel
    # len(lines) + 1 is the 1-indexed line just past the end of the file,
    # which makes the last article extend to EOF.
    next_starts = [start for start, _ in boundaries[1:]] + [len(lines) + 1]

    for (start_line, title), next_start in zip(boundaries, next_starts):
        slug = title.replace(" ", "_").lower()
        # Convert the 1-indexed, end-exclusive line range to a 0-indexed slice.
        article_content = "".join(lines[start_line - 1:next_start - 1]).strip()

        filename = f"{slug}.txt"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, "w", encoding="utf-8") as out:
            out.write(article_content)
        print(f"Created {filename} ({len(article_content)} chars)")
# Script entry point: perform the split only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    split_wiki()