Spaces:
Sleeping
Sleeping
| import stanza | |
| from stanza.utils.conll import CoNLL | |
| import sys | |
| import os | |
def main(input_file, output_file='output.conllu'):
    """Parse a CoNLL-U file with the Genipapo dependency model and save the result.

    The input document is loaded, run through a Stanza pipeline that contains
    only the dependency parser (tokens and POS tags are taken from the file),
    and written back out in CoNLL-U format.

    Args:
        input_file: Path to the CoNLL-U file to parse.
        output_file: Path the parsed document is written to. Defaults to
            'output.conllu' in the current working directory.

    Returns:
        None. If 'models/genipapo.pt' is missing, a message is printed and the
        function returns without loading the pipeline or writing any output.
    """
    # The model is looked up relative to the current working directory.
    model_path = os.path.join('models', 'genipapo.pt')

    # Bail out early: without the model file the pipeline cannot be built.
    if not os.path.exists(model_path):
        print("Genipapo model not found. Please run 'download_model.py' first to download the model.")
        return

    # Initialize the Stanza pipeline with the custom dependency parser model.
    nlp = stanza.Pipeline(
        lang='pt',
        processors='depparse',
        depparse_pretagged=True,  # Assumes the input file has POS tags already
        depparse_model_path=model_path,
        tokenize_pretokenized=True,  # Assumes tokens are already split in .conllu format
        use_gpu=False,
        download_method=None,
    )

    # Load the whole input file as one document and parse it.
    doc = CoNLL.conll2doc(input_file=input_file)
    parsed_doc = nlp(doc)

    # Copy the predicted heads and relations onto the originally loaded document.
    # NOTE(review): if the pipeline annotates `doc` in place and returns the same
    # object, this loop is a no-op -- confirm against the Stanza version in use
    # before removing it.
    for orig_sentence, parsed_sentence in zip(doc.sentences, parsed_doc.sentences):
        for orig_word, parsed_word in zip(orig_sentence.words, parsed_sentence.words):
            orig_word.head = parsed_word.head
            orig_word.deprel = parsed_word.deprel

    # Save the updated document in CoNLL-U format, followed by a blank line.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("{:C}".format(doc))
        f.write('\n\n')

    print(f"Updated CONLLU file saved to '{output_file}'")
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the path to the
    # CoNLL-U file to parse. Problems are reported on stdout.
    if len(sys.argv) == 2:
        input_file = sys.argv[1]
        if os.path.exists(input_file):
            main(input_file)
        else:
            print(f"Input file {input_file} does not exist.")
    else:
        print("Usage: python run_parser.py path/to/your_file.conllu")