gemma-sage / test /split_wiki.py
neuralworm's picture
feat: Agentic Oracle, Multi-Message Bubbles, Filtering, and UI Fixes
8ab43a3
import os
# Default article boundaries for the bundled "wikitop10.txt" dump:
# (1-indexed start line, article title), listed in ascending line order.
_DEFAULT_BOUNDARIES = (
    (1, "United Kingdom"),
    (108, "Physics"),
    (232, "Chemistry"),
    (337, "United States"),
    (497, "Science"),
    (693, "Isaac Newton"),
    (729, "Leonardo da Vinci"),
    (813, "Evolution"),
    (1045, "Albert Einstein"),
    (1156, "India"),
)


def split_wiki(
    input_file: str = "wikitop10.txt",
    output_dir: str = "tests/test_data/wiki_top10",
    boundaries=None,
) -> None:
    """Split a concatenated text dump into one file per article.

    Each article spans from its own start line up to (but not including) the
    start line of the next boundary; the last article runs to the end of the
    file. Every article is stripped of leading/trailing whitespace and written
    to ``<output_dir>/<title>.txt``, where the title is lower-cased and its
    spaces are replaced by underscores. One progress line is printed per file.

    Args:
        input_file: Path of the combined text file to split.
        output_dir: Directory receiving the per-article files; created if it
            does not exist.
        boundaries: Sequence of ``(start_line, title)`` pairs with 1-indexed
            start lines, expected in ascending order (not validated). When
            ``None``, the built-in table for ``wikitop10.txt`` is used.

    Note:
        A start line past the end of the input produces an empty output file.
    """
    if boundaries is None:
        boundaries = _DEFAULT_BOUNDARIES
    boundaries = list(boundaries)

    os.makedirs(output_dir, exist_ok=True)

    # errors='ignore' drops undecodable bytes instead of raising, so a dump
    # with stray non-UTF-8 bytes can still be split (best-effort by design).
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    # Pair every boundary with the start line of the one after it; the final
    # article gets a sentinel one past the last line so it extends to EOF.
    next_starts = [start for start, _ in boundaries[1:]] + [len(lines) + 1]

    for (start, raw_title), next_start in zip(boundaries, next_starts):
        # Convert 1-indexed line numbers to 0-indexed slice bounds.
        article_content = "".join(lines[start - 1:next_start - 1]).strip()

        title = raw_title.replace(" ", "_").lower()
        filename = f"{title}.txt"
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as out:
            out.write(article_content)
        print(f"Created {filename} ({len(article_content)} chars)")
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    split_wiki()