| | import logging |
| | import os |
| | from typing import TextIO, Union |
| |
|
| | from conllu import parse_incr |
| | from utils_ner import InputExample, Split, TokenClassificationTask |
| |
|
| |
|
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)
| |
|
| |
|
class NER(TokenClassificationTask):
    """Named-entity-recognition task over CoNLL-style column data.

    Each non-separator input line holds space-separated columns whose first
    column is the token; ``label_idx`` selects which column carries the label
    (the last column by default, as in CoNLL-2003). Examples (sentences) are
    separated by blank lines or ``-DOCSTART-`` marker lines.
    """

    def __init__(self, label_idx=-1):
        # Index of the label column within each split line; -1 (last column)
        # matches the CoNLL-2003 NER layout.
        self.label_idx = label_idx

    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
        """Parse ``{mode}.txt`` under ``data_dir`` into a list of InputExamples.

        Blank lines and ``-DOCSTART-`` lines close the current example.
        Lines with only a token column (e.g. unlabeled test data) are given
        the default "O" label.
        """
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        guid_index = 1
        examples = []
        with open(file_path, encoding="utf-8") as f:
            words = []
            labels = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if words:
                        examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                        guid_index += 1
                        words = []
                        labels = []
                else:
                    splits = line.split(" ")
                    words.append(splits[0])
                    if len(splits) > 1:
                        # Strip the trailing newline the file iterator keeps.
                        labels.append(splits[self.label_idx].replace("\n", ""))
                    else:
                        # No label column (e.g. mode == "test"): default to "O".
                        labels.append("O")
            if words:
                # Flush the final example when the file does not end with a
                # separator line.
                examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
        return examples

    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
        """Copy the test input to ``writer``, appending a prediction per token.

        ``preds_list`` holds one mutable list of label strings per example;
        predictions are consumed (popped) as they are written. Tokens beyond
        the model's maximum sequence length have no prediction and are only
        logged, mirroring the truncation done at encoding time.
        """
        example_id = 0
        for line in test_input_reader:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                writer.write(line)
                # Advance once the current example's predictions are fully
                # consumed. The bounds check prevents an IndexError on
                # trailing separator lines after the last example (or when
                # the input has more sentences than preds_list entries).
                if example_id < len(preds_list) and not preds_list[example_id]:
                    example_id += 1
            elif example_id < len(preds_list) and preds_list[example_id]:
                output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                writer.write(output_line)
            else:
                logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])

    def get_labels(self, path: str) -> list[str]:
        """Return label names from ``path`` (one per line), or CoNLL-2003 defaults.

        The outside "O" label is prepended if the file does not contain it.
        """
        if path:
            with open(path) as f:
                labels = f.read().splitlines()
            if "O" not in labels:
                labels = ["O"] + labels
            return labels
        else:
            return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
| |
|
| |
|
class Chunk(NER):
    """Chunking (shallow parsing) task over CoNLL-2000 style column data."""

    def __init__(self):
        # The chunk tag sits in the second-to-last column of each line.
        super().__init__(label_idx=-2)

    def get_labels(self, path: str) -> list[str]:
        """Return chunk labels from ``path``, or the CoNLL-2000 default tag set.

        Ensures the outside "O" label is present at the front of the list.
        """
        if not path:
            return [
                "O",
                "B-ADVP",
                "B-INTJ",
                "B-LST",
                "B-PRT",
                "B-NP",
                "B-SBAR",
                "B-VP",
                "B-ADJP",
                "B-CONJP",
                "B-PP",
                "I-ADVP",
                "I-INTJ",
                "I-LST",
                "I-PRT",
                "I-NP",
                "I-SBAR",
                "I-VP",
                "I-ADJP",
                "I-CONJP",
                "I-PP",
            ]
        with open(path) as f:
            labels = f.read().splitlines()
        return labels if "O" in labels else ["O", *labels]
| |
|
| |
|
class POS(TokenClassificationTask):
    """Part-of-speech tagging task over CoNLL-U input parsed with ``conllu``."""

    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> list[InputExample]:
        """Read ``{mode}.txt`` (CoNLL-U format) into a list of InputExamples."""
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        examples = []

        with open(file_path, encoding="utf-8") as f:
            guid_index = 1
            for sentence in parse_incr(f):
                words = [token["form"] for token in sentence]
                labels = [token["upos"] for token in sentence]
                assert len(words) == len(labels)
                if words:
                    examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                    guid_index += 1
        return examples

    def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: list):
        """Write one line per sentence: each token with its gold and predicted tag.

        Predictions are consumed (popped) from the per-example lists in
        ``preds_list`` as they are written.
        """
        for example_id, sentence in enumerate(parse_incr(test_input_reader)):
            s_p = preds_list[example_id]
            pieces = [f"{token['form']} ({token['upos']}|{s_p.pop(0)}) " for token in sentence]
            writer.write("".join(pieces) + "\n")

    def get_labels(self, path: str) -> list[str]:
        """Return POS labels from ``path`` (one per line), or the 17 UPOS defaults."""
        if not path:
            return [
                "ADJ",
                "ADP",
                "ADV",
                "AUX",
                "CCONJ",
                "DET",
                "INTJ",
                "NOUN",
                "NUM",
                "PART",
                "PRON",
                "PROPN",
                "PUNCT",
                "SCONJ",
                "SYM",
                "VERB",
                "X",
            ]
        with open(path) as f:
            return f.read().splitlines()
| |
|