Spaces:

bryankhelven
/

genipapo-parser

Sleeping

genipapo-parser / run_parser.py

Bryan Khelven

Initial deploy

ffdedc7 3 months ago

1.88 kB

	import stanza
	from stanza.utils.conll import CoNLL
	import sys
	import os

	def main(input_file: str, output_file: str = 'output.conllu') -> None:
	    """Parse a CoNLL-U file with the Genipapo dependency model and save it.

	    The input is loaded as a Stanza document, run through a pipeline that
	    contains only the ``depparse`` processor (loaded from
	    ``models/genipapo.pt``), and written back out in CoNLL-U format.

	    Args:
	        input_file: Path to the CoNLL-U file to parse. The pipeline is
	            configured with ``depparse_pretagged=True``, so the file is
	            expected to already carry POS tags.
	        output_file: Path the parsed document is written to. Defaults to
	            ``'output.conllu'`` in the current working directory.

	    Returns:
	        None. If the model file is missing, a message is printed and the
	        function returns without parsing anything.
	    """
	    # The model is looked up relative to the current working directory.
	    model_path = os.path.join('models', 'genipapo.pt')

	    # Bail out early with a hint when the model has not been downloaded.
	    if not os.path.exists(model_path):
	        print("Genipapo model not found. Please run 'download_model.py' first to download the model.")
	        return

	    # Initialize the Stanza pipeline with the custom dependency parser model.
	    nlp = stanza.Pipeline(
	        lang='pt',
	        processors='depparse',
	        depparse_pretagged=True,  # Assumes the input file has POS tags already
	        depparse_model_path=model_path,
	        tokenize_pretokenized=True,  # Assumes tokens are already split in .conllu format
	        use_gpu=False,
	        download_method=None,
	    )

	    # Load every sentence of the input CoNLL-U file and parse it.
	    doc = CoNLL.conll2doc(input_file=input_file)
	    parsed_doc = nlp(doc)

	    # Copy the predicted head and relation back onto the original document.
	    # NOTE(review): if the pipeline annotates `doc` in place and returns the
	    # same object, this loop is a harmless no-op -- confirm before removing.
	    for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences):
	        for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words):
	            orig_word.head = parsed_word.head
	            orig_word.deprel = parsed_word.deprel

	    # Write the document in CoNLL-U format, followed by two newlines so the
	    # last sentence is terminated by a blank line.
	    with open(output_file, 'w', encoding='utf-8') as f:
	        f.write("{:C}\n\n".format(doc))

	    print(f"Updated CONLLU file saved to '{output_file}'")

	if __name__ == '__main__':
	    # Script entry point: expects exactly one argument, the CoNLL-U file path.
	    cli_args = sys.argv[1:]
	    input_file = cli_args[0] if len(cli_args) == 1 else None

	    if input_file is None:
	        # Wrong number of arguments: show how the script is meant to be called.
	        print("Usage: python run_parser.py path/to/your_file.conllu")
	    elif os.path.exists(input_file):
	        main(input_file)
	    else:
	        print(f"Input file {input_file} does not exist.")