varox34
/

Dependency-Parser

Model card Files Files and versions

Dependency-Parser / sentVocabGen.py

varox34's picture

Upload 64 files

366b225 verified about 2 years ago

history blame contribute delete

2.28 kB

	import os
	import json
	import re


	# Command to extract:
	# python WikiExtractor.py ../data/hiwiki-20200220-pages-articles.xml.bz2 --json -de gallery,timeline,noinclude --filter_disambig_pages

	def GenSentVocab():
	folders = os.listdir('text')
	vocab = {}
	out_file = open('sentences.txt','a')
	for fldr in folders:
	fldr_pth = os.path.join('text',fldr)
	files = os.listdir(fldr_pth)
	for fl in files:
	fl_pth = os.path.join(fldr_pth,fl)
	with open(fl_pth,'r') as f:
	lines = f.readlines()
	for line in lines:
	article = json.loads(line)
	text = article['text'].replace('॥',' ॥\n')
	text = article['text'].replace('।',' ।\n')
	text = article['text'].replace('.',' .\n')
	for s in ['\"',"\'",'?','.',',','(',')','[',']','{','}','-','\|',';',':','/','\\','=','’','—','‘','`','!','@','$','~','&','^','%','“','”','+','*','।']:
	text = text.replace(s,' '+s+' ')
	ref_tag = re.compile(r'<ref( name= \" .? \" )>.*</ref>')
	text = re.sub(ref_tag,'',text)
	ref_tag = re.compile(r'<.*?>')
	text = re.sub(ref_tag,'',text)
	text = re.sub(' +',' ',text)
	text = re.sub(r"\n+",'\n',text)
	text = re.sub(r"[A-Z\|a-z]",'',text)
	text = re.sub(r"[0-9]+",' \1 ',text)
	sentences = re.split(r"\n",text)
	for sent in sentences:
	sentence = sent.strip()
	if sentence == '':
	continue
	out_file.write(sentence+'\n')
	tokens = sentence.split()
	for tkn in tokens:
	if tkn not in vocab:
	vocab[tkn] = 1
	else:
	vocab[tkn] += 1
	out_file.close()
	with open('vocabulary.txt','a') as voc:
	for w in vocab:
	voc.write(w+'\n')
	voc.write('<UNK>'+'\n')




	if __name__=='__main__':
	GenSentVocab()