|
|
import xml.etree.ElementTree as ET
|
|
|
import re
|
|
|
from tqdm import tqdm
|
|
|
import os
|
|
|
|
|
|
# Path of the XML dump that parse_wiki_dump() reads.
INPUT_FILE = "data/wikipedia/raw/mk_wiki.xml"

# Path of the plain-text file that parse_wiki_dump() writes; its parent
# directory is created on demand by create_directories().
OUTPUT_FILE = "data/wikipedia/processed/mk_wiki_text.txt"
|
|
|
|
|
|
# Patterns used by clean_wiki_text, compiled once at import time because the
# function is called for every article in the dump.
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
# The word boundary keeps '<references ...>' from being treated as a '<ref>'.
_SELF_CLOSING_REF_RE = re.compile(r'<ref\b[^>]*/>')
_REF_RE = re.compile(r'<ref\b[^>]*>.*?</ref>', re.DOTALL)
# Matches a template that contains no nested '{{' or '}}' (single braces are
# allowed), i.e. an innermost template.
_TEMPLATE_RE = re.compile(r'\{\{(?:[^{}]|\{(?!\{)|\}(?!\}))*\}\}')
_LINK_RE = re.compile(r'\[\[(?:[^|\]]*\|)?([^\]]+)\]\]')
_HEADING_RE = re.compile(r'==+.*?==+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_wiki_text(text):
    """Strip common wiki markup from *text* and return plain text.

    Removes HTML comments, ``<ref>`` footnotes (paired and self-closing),
    ``{{...}}`` templates (including nested ones) and ``== heading ==``
    markers, replaces ``[[target|label]]`` / ``[[target]]`` links with their
    visible text, and collapses every run of whitespace to a single space.

    Args:
        text: Raw wiki markup as a string.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Comments go first so markup hidden inside them cannot confuse the
    # later passes. DOTALL lets comments and refs span several lines.
    text = _COMMENT_RE.sub('', text)

    # Self-closing refs must be removed before paired ones; otherwise
    # '<ref name="x" />' is read as an opening tag and everything up to the
    # next '</ref>' is deleted along with it.
    text = _SELF_CLOSING_REF_RE.sub('', text)
    text = _REF_RE.sub('', text)

    # Remove templates innermost-first and repeat until none are left, so
    # nested templates such as '{{a|{{b}} c}}' do not leave ' c}}' behind.
    while True:
        text, removed = _TEMPLATE_RE.subn('', text)
        if not removed:
            break

    # Keep only the visible part of internal links.
    text = _LINK_RE.sub(r'\1', text)

    # Drop section headings together with their title text.
    text = _HEADING_RE.sub('', text)

    # Newlines and repeated blanks become a single space.
    text = _WHITESPACE_RE.sub(' ', text)

    return text.strip()
|
|
|
|
|
|
def create_directories():
    """Create the parent directory of OUTPUT_FILE if it does not exist yet.

    Does nothing when OUTPUT_FILE has no directory component (the file is
    then written to the current working directory).
    """
    output_dir = os.path.dirname(OUTPUT_FILE)

    # dirname() returns '' for a bare file name, and os.makedirs('') raises,
    # so only create a directory when there is one to create.
    if output_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
|
def parse_wiki_dump():
    """Stream INPUT_FILE and write the cleaned article texts to OUTPUT_FILE.

    Every ``text`` element whose content is longer than 100 characters is
    passed through clean_wiki_text(); non-empty results are written to
    OUTPUT_FILE, each followed by a blank line. The output directory is
    created first via create_directories().
    """
    create_directories()

    # 'start' events are requested only to obtain the root element (the
    # first event). Without a handle on the root, every element that was
    # already processed stays attached to it and memory grows with the
    # size of the dump.
    events = ET.iterparse(INPUT_FILE, events=('start', 'end'))
    _, root = next(events)

    # Only completed elements are of interest; filtering here also keeps
    # the tqdm counter at one tick per finished element.
    finished = (elem for event, elem in events if event == 'end')

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for elem in tqdm(finished):
            # ElementTree spells namespaced tags as '{uri}local'; compare the
            # local name exactly instead of matching any tag ending in 'text'.
            local_name = elem.tag.rsplit('}', 1)[-1]

            if local_name == 'text':
                text = elem.text

                if text and len(text) > 100:
                    cleaned_text = clean_wiki_text(text)

                    if cleaned_text:
                        f.write(cleaned_text + '\n\n')

                # Release the (potentially large) article body right away.
                elem.clear()
            elif local_name == 'page':
                # The page is complete: detach it and its siblings from the
                # root so the partially built tree does not keep growing.
                root.clear()
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    parse_wiki_dump()