import os
import json
import re
from collections import Counter

# Command to extract:
# python WikiExtractor.py ../data/hiwiki-20200220-pages-articles.xml.bz2 --json -de gallery,timeline,noinclude --filter_disambig_pages

# Characters that are surrounded by spaces so they become separate tokens.
_PUNCTUATION = '"\'?.,()[]{}-|;:/\\=’—‘`!@$~&^%“”+*।'

# Characters that terminate a sentence: padded on the left, newline on the right.
_SENTENCE_ENDERS = '॥।.'

# One translation table performs every single-character rewrite in one pass.
# Sentence enders are added last so that they override the plain padding
# entry for the characters that appear in both sets ('।' and '.').
_CHAR_TABLE = str.maketrans({
    **{ch: ' ' + ch + ' ' for ch in _PUNCTUATION},
    **{ch: ' ' + ch + '\n' for ch in _SENTENCE_ENDERS},
})

# NOTE(review): the original "ref_tag" pattern was r'.*', which erased the
# content of every line. The variable name indicates <ref> elements were the
# intended target; confirm these reconstructed patterns against real dumps.
_REF_SELF_CLOSING = re.compile(r'<ref\b[^>]*/>')
_REF_ELEMENT = re.compile(r'<ref\b[^>]*>.*?</ref>', re.DOTALL)
_ANY_TAG = re.compile(r'<.*?>')
_LATIN_LETTERS = re.compile(r'[A-Za-z]')
_ASCII_DIGITS = re.compile(r'[0-9]+')


def _split_into_sentences(text):
    """Return the cleaned, space-tokenised sentences found in *text*."""
    # Markup is removed first: padding punctuation would otherwise break the
    # '/', '=' and '"' characters that appear inside tags.
    text = _REF_SELF_CLOSING.sub('', text)
    text = _REF_ELEMENT.sub('', text)
    text = _ANY_TAG.sub('', text)

    # Mark sentence boundaries and isolate punctuation.
    text = text.translate(_CHAR_TABLE)

    # Drop Latin letters, then make every run of ASCII digits its own token.
    text = _LATIN_LETTERS.sub('', text)
    text = _ASCII_DIGITS.sub(r' \g<0> ', text)

    sentences = []
    for raw in text.split('\n'):
        tokens = raw.split()
        if tokens:
            # Re-joining on one space collapses every run of whitespace.
            sentences.append(' '.join(tokens))
    return sentences


def GenSentVocab(text_dir: str = 'text',
                 sentences_path: str = 'sentences.txt',
                 vocab_path: str = 'vocabulary.txt') -> None:
    """Build a sentence file and a vocabulary file from extracted articles.

    Every file in every sub-folder of *text_dir* is read as JSON lines; each
    object's ``'text'`` value is cleaned and split into sentences. Sentences
    are appended to *sentences_path* one per line, and each distinct token is
    appended to *vocab_path* in first-seen order, followed by one empty line.

    Both output files are opened in append mode, so running the function
    again adds to whatever they already contain.

    Args:
        text_dir: Directory whose sub-folders hold the JSON-lines files.
        sentences_path: File that receives one sentence per line.
        vocab_path: File that receives one token per line.

    Raises:
        OSError: If an input or output path cannot be opened.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a JSON object has no ``'text'`` key.
    """
    vocab = Counter()
    with open(sentences_path, 'a', encoding='utf-8') as out_file:
        # Sorted so the output order does not depend on the file system.
        for folder in sorted(os.listdir(text_dir)):
            folder_path = os.path.join(text_dir, folder)
            for file_name in sorted(os.listdir(folder_path)):
                file_path = os.path.join(folder_path, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        if not line.strip():
                            continue  # tolerate blank lines between records
                        article = json.loads(line)
                        for sentence in _split_into_sentences(article['text']):
                            out_file.write(sentence + '\n')
                            vocab.update(sentence.split())

    with open(vocab_path, 'a', encoding='utf-8') as voc:
        voc.writelines(word + '\n' for word in vocab)
        # The original wrote an empty entry after the words; kept so the
        # file layout stays the same for existing consumers.
        voc.write('' + '\n')


if __name__ == '__main__':
    GenSentVocab()