|
|
import os |
|
|
import json |
|
|
import re |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def GenSentVocab(): |
|
|
folders = os.listdir('text') |
|
|
vocab = {} |
|
|
out_file = open('sentences.txt','a') |
|
|
for fldr in folders: |
|
|
fldr_pth = os.path.join('text',fldr) |
|
|
files = os.listdir(fldr_pth) |
|
|
for fl in files: |
|
|
fl_pth = os.path.join(fldr_pth,fl) |
|
|
with open(fl_pth,'r') as f: |
|
|
lines = f.readlines() |
|
|
for line in lines: |
|
|
article = json.loads(line) |
|
|
text = article['text'].replace('॥',' ॥\n') |
|
|
text = article['text'].replace('।',' ।\n') |
|
|
text = article['text'].replace('.',' .\n') |
|
|
for s in ['\"',"\'",'?','.',',','(',')','[',']','{','}','-','|',';',':','/','\\','=','’','—','‘','`','!','@','$','~','&','^','%','“','”','+','*','।']: |
|
|
text = text.replace(s,' '+s+' ') |
|
|
ref_tag = re.compile(r'<ref( name= \" .*? \" )*>.*</ref>') |
|
|
text = re.sub(ref_tag,'',text) |
|
|
ref_tag = re.compile(r'<.*?>') |
|
|
text = re.sub(ref_tag,'',text) |
|
|
text = re.sub(' +',' ',text) |
|
|
text = re.sub(r"\n+",'\n',text) |
|
|
text = re.sub(r"[A-Z|a-z]",'',text) |
|
|
text = re.sub(r"[0-9]+",' \1 ',text) |
|
|
sentences = re.split(r"\n",text) |
|
|
for sent in sentences: |
|
|
sentence = sent.strip() |
|
|
if sentence == '': |
|
|
continue |
|
|
out_file.write(sentence+'\n') |
|
|
tokens = sentence.split() |
|
|
for tkn in tokens: |
|
|
if tkn not in vocab: |
|
|
vocab[tkn] = 1 |
|
|
else: |
|
|
vocab[tkn] += 1 |
|
|
out_file.close() |
|
|
with open('vocabulary.txt','a') as voc: |
|
|
for w in vocab: |
|
|
voc.write(w+'\n') |
|
|
voc.write('<UNK>'+'\n') |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__=='__main__': |
|
|
GenSentVocab() |