Spaces:

sumanthd
/

IndicTrans-MultilingualTranslation

Build error

App Files Files Community

IndicTrans-MultilingualTranslation / scripts /extract_non_english_pairs.py

Shanks0465

Added Multilingual Interface

d984b22 over 3 years ago

raw

history blame contribute delete

4.69 kB

	from tqdm import tqdm
	import os
	from collections import defaultdict


	def read_file(fname):
	    """
	    Lazily yields the lines of a UTF-8 encoded text file.

	    fname: path of the file to read.

	    Every yielded string has its leading and trailing whitespace
	    (including the line terminator) removed. Because this is a generator,
	    the file is only opened once iteration starts and is closed when the
	    generator is exhausted or discarded.
	    """
	    with open(fname, "r", encoding="utf-8") as handle:
	        yield from map(str.strip, handle)


	def extract_non_english_pairs(indir, outdir, LANGS, *, multiple_translations=False):
	    """
	    Extracts non-English parallel corpora by pivoting through English.

	    For every pair (xx, yy) of languages in LANGS, the sentences of xx and yy
	    that are aligned to the same English sentence are paired up and written,
	    line-aligned, to outdir/xx-yy/train.xx and outdir/xx-yy/train.yy.

	    indir: contains English-centric data in the following form:
	        - directory named en-xx for language xx
	        - each directory contains a train.en and train.xx
	    outdir: output directory to store mined data for each pair.
	        One directory is created for each pair.
	    LANGS: list of languages in the corpus (other than English).
	        The language codes must correspond to the ones used in the
	        files and directories in indir. Preferably, sort the languages
	        in this list in alphabetic order. outdir will contain data for xx-yy,
	        but not for yy-xx, so it will be convenient to have this list in
	        sorted order.
	    multiple_translations: if False (default), an English sentence that
	        occurs several times in en-xx/train.en contributes only its last
	        xx translation. If True, every xx translation of that English
	        sentence is paired with each matching yy line.

	    Returns nothing; the result is the set of files written under outdir.
	    """
	    for i in tqdm(range(len(LANGS) - 1)):
	        print()
	        lang1 = LANGS[i]
	        fname1 = "{}/en-{}/train.en".format(indir, lang1)
	        il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
	        # The English side of lang1 does not depend on lang2, so it is read
	        # into a set once per lang1 instead of once per (lang1, lang2) pair.
	        enset_l1 = set(read_file(fname1))

	        for j in range(i + 1, len(LANGS)):
	            lang2 = LANGS[j]
	            print("{} {}".format(lang1, lang2))

	            fname2 = "{}/en-{}/train.en".format(indir, lang2)
	            il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)

	            # English sentences present in both en-lang1 and en-lang2.
	            common_en_set = enset_l1.intersection(read_file(fname2))

	            # Map each shared English sentence to a list of lang1 translations.
	            # Values are always lists, so the writing loop below is the same
	            # for both modes.
	            en_lang1_dict = {}
	            for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
	                if en_line not in common_en_set:
	                    continue
	                if multiple_translations:
	                    en_lang1_dict.setdefault(en_line, []).append(il_line)
	                else:
	                    # Last occurrence wins, as with a plain dict assignment.
	                    en_lang1_dict[en_line] = [il_line]

	            pair_dir = "{}/{}-{}".format(outdir, lang1, lang2)
	            os.makedirs(pair_dir, exist_ok=True)
	            out_l1_fname = "{}/train.{}".format(pair_dir, lang1)
	            out_l2_fname = "{}/train.{}".format(pair_dir, lang2)

	            with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
	                out_l2_fname, "w", encoding="utf-8"
	            ) as out_l2_file:
	                for en_line, lang2_line in zip(
	                    read_file(fname2), read_file(il_fname2)
	                ):
	                    # Each stored lang1 translation is written exactly once per
	                    # matching lang2 line. Iterating a list (never a bare
	                    # string) guarantees a pair is not repeated once per
	                    # character of the lang1 sentence.
	                    for lang1_line in en_lang1_dict.get(en_line, ()):
	                        out_l1_file.write(lang1_line + "\n")
	                        out_l2_file.write(lang2_line + "\n")


	def get_extracted_stats(outdir, LANGS):
	    """
	    Gathers line-count stats from the extracted directories.

	    outdir: directory holding the mined data for each pair, one xx-yy
	        sub-directory per pair, each containing a train.xx file.
	    LANGS: list of languages in the corpus (other than English).
	        The language codes must correspond to the ones used in the
	        directory and file names under outdir. Preferably, sort the
	        languages in this list in alphabetic order. Only outdir/xx-yy is
	        read, where xx comes before yy in LANGS, so the order must match
	        the one used when the data was extracted.

	    returns: list of (lang_a, lang_b, count) tuples, where count is the
	        number of lines in outdir/xx-yy/train.xx. Each pair is reported
	        in both directions, (xx, yy, count) and (yy, xx, count).
	    """
	    common_stats = []
	    for i in tqdm(range(len(LANGS) - 1)):
	        for j in range(i + 1, len(LANGS)):
	            lang1 = LANGS[i]
	            lang2 = LANGS[j]

	            out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
	                o=outdir, l1=lang1, l2=lang2
	            )

	            # Count lines lazily; a generator avoids building a list with
	            # one element per corpus line just to sum it.
	            cnt = sum(1 for _ in read_file(out_l1_fname))
	            common_stats.append((lang1, lang2, cnt))
	            common_stats.append((lang2, lang1, cnt))
	    return common_stats