Spaces:

suricodes
/

hindi-sindhi-docker

Paused

App Files Files Community

hindi-sindhi-docker / mosesdecoder /scripts /nbest-rescore /train.py

suricodes

Upload folder using huggingface_hub

fd49381 verified over 1 year ago

raw

history blame

4.61 kB

	#!/usr/bin/env python
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	import argparse
	import os
	import subprocess
	import sys

	# Zero-based index of the feature field in an N-best entry, whose fields
	# are separated by "|||"
	FEAT_FIELD = 2

	# Location of mert, kbmira, etc. in relation to this script: the "bin"
	# directory two levels above the directory containing this file. The path
	# is made absolute first because a relative __file__ (e.g. "train.py")
	# would otherwise collapse to the empty string and make the default resolve
	# to "bin" under the current working directory.
	BIN_DIR = os.path.join(
	    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
	    'bin')

	def main():
	    """Learn N-best rescoring weights and write them to rescore.ini.

	    Steps, all driven by command-line arguments:

	    1. Locate the `extractor` and `kbmira` executables in --bin-dir.
	    2. Read the feature names and the number of weights per feature from
	       the first entry of the --nbest list (features are assumed dense).
	    3. Run `extractor` (case-sensitive BLEU) to produce scores.dat and
	       features.dat in --working-dir.
	    4. Write init.dense with every weight set to 0 and run `kbmira` for
	       --iterations iterations, producing mert.out.
	    5. Read the optimized weights, scale them so their absolute values
	       sum to 1, and write rescore.ini.

	    Exits with status 1 (after a message on stderr) if an executable is
	    missing, the N-best list has no feature field, an external tool
	    returns a non-zero status, or mert.out has fewer weights than expected.
	    """

	    def fail(message):
	        # Report a fatal error and stop; keeps the error paths uniform.
	        sys.stderr.write('Error: {}\n'.format(message))
	        sys.exit(1)

	    # Args
	    parser = argparse.ArgumentParser(description='Learn N-best rescoring weights')
	    parser.add_argument('--nbest', metavar='nbest', required=True,
	                        help='Dev set N-best list augmented with new features')
	    parser.add_argument('--ref', metavar='ref', required=True,
	                        help='Dev set reference translation')
	    parser.add_argument('--working-dir', metavar='rescore-work', required=True,
	                        help='Optimizer working directory')
	    parser.add_argument('--bin-dir', metavar='DIR', default=BIN_DIR,
	                        help='Moses bin dir, containing kbmira, evaluator, etc.')
	    # Since we're starting with uniform weights and only running kbmira once,
	    # run a gratuitous number of iterations. (mert-moses.pl default is 60
	    # iterations for each Moses run)
	    parser.add_argument('--iterations', metavar='N', type=int, default=300,
	                        help='Number of K-best MIRA iterations to run (default: 300)')
	    args = parser.parse_args()

	    # Find executables
	    extractor = os.path.join(args.bin_dir, 'extractor')
	    kbmira = os.path.join(args.bin_dir, 'kbmira')
	    for exe in (extractor, kbmira):
	        if not os.path.exists(exe):
	            sys.stderr.write('Error: cannot find executable "{}" in "{}", please specify --bin-dir\n'.format(exe, args.bin_dir))
	            sys.exit(1)

	    # rescore-work dir (create missing parent directories as well)
	    if not os.path.isdir(args.working_dir):
	        os.makedirs(args.working_dir)

	    scores_file = os.path.join(args.working_dir, 'scores.dat')
	    features_file = os.path.join(args.working_dir, 'features.dat')
	    dense_file = os.path.join(args.working_dir, 'init.dense')
	    mert_out_file = os.path.join(args.working_dir, 'mert.out')

	    # Feature names and numbers of weights from N-best list
	    # Assume all features are dense (present for each entry), so the first
	    # entry is enough to learn the layout.
	    with open(args.nbest) as nbest:
	        first_entry = nbest.readline()
	    fields = [f.strip() for f in first_entry.split('|||')]
	    if len(fields) <= FEAT_FIELD:
	        fail('cannot find feature field in first entry of "{}"'.format(args.nbest))
	    # A token ending in '=' names a feature; every following token up to
	    # the next name is one of its values. Values that precede the first
	    # name are ignored.
	    init_weights = []
	    for token in fields[FEAT_FIELD].split():
	        if token.endswith('='):
	            init_weights.append([token, []])
	        elif init_weights:
	            # Start all weights at 0
	            init_weights[-1][1].append(0)

	    # Extract score and feature data from N-best list
	    extractor_cmd = [extractor,
	                     '--sctype', 'BLEU', '--scconfig', 'case:true',
	                     '--scfile', scores_file,
	                     '--ffile', features_file,
	                     '-r', args.ref,
	                     '-n', args.nbest]
	    status = subprocess.call(extractor_cmd)
	    if status != 0:
	        fail('extractor exited with status {}'.format(status))

	    # Write dense feature list
	    with open(dense_file, 'w') as out:
	        for (feat, weights) in init_weights:
	            for w in weights:
	                out.write('{} {}\n'.format(feat, w))

	    # Run K-best MIRA optimizer
	    kbmira_cmd = [kbmira,
	                  '--dense-init', dense_file,
	                  '--ffile', features_file,
	                  '--scfile', scores_file,
	                  '-o', mert_out_file,
	                  '--iters', str(args.iterations)]
	    status = subprocess.call(kbmira_cmd)
	    if status != 0:
	        fail('kbmira exited with status {}'.format(status))

	    # Read optimized weights, sum for normalization
	    opt_weights = []
	    total = 0
	    with open(mert_out_file) as inp:
	        # Same structure as original weight list: one line per weight, in
	        # the order written to init.dense, with the value in the second
	        # whitespace-separated column.
	        for (feat, weights) in init_weights:
	            values = []
	            for _ in weights:
	                columns = inp.readline().split()
	                if len(columns) < 2:
	                    fail('"{}" has fewer weights than expected'.format(mert_out_file))
	                w = float(columns[1])
	                values.append(w)
	                # Sum for normalization
	                total += abs(w)
	            opt_weights.append([feat, values])

	    # Normalize weights so that their absolute values sum to 1. If every
	    # weight is 0 there is nothing to scale, so leave them untouched
	    # rather than dividing by zero.
	    if total != 0:
	        for entry in opt_weights:
	            entry[1] = [w / total for w in entry[1]]

	    # Generate rescore.ini
	    with open(os.path.join(args.working_dir, 'rescore.ini'), 'w') as out:
	        out.write('# For use with Moses N-best rescorer "scripts/nbest-rescore/rescore.py"\n')
	        out.write('\n')
	        out.write('[weight]\n')
	        for (feat, weights) in opt_weights:
	            out.write('{} {}\n'.format(feat, ' '.join(str(w) for w in weights)))

	# Run the trainer only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()