#!/usr/bin/env python3
# coding=utf-8

import json
import os
from itertools import chain

from transformers import AutoTokenizer

from utility.subtokenize import subtokenize

os.environ["TOKENIZERS_PARALLELISM"] = "true"
def load_dataset(path):
    # Load a JSON-lines dataset: one graph per line, indexed by its "id".
    data = {}
    with open(path, encoding="utf8") as f:
        for line in f:
            sentence = json.loads(line)
            data[sentence["id"]] = sentence

            if "nodes" not in sentence:
                sentence["nodes"] = []
            if "edges" not in sentence:
                sentence["edges"] = []

    for sample in data.values():
        # Split the raw sentence on single spaces and record each token's
        # character span; the +1 skips the separating space.
        sample["sentence"] = sample["input"]
        sample["input"] = sample["sentence"].split(' ')
        sample["token anchors"], offset = [], 0
        for token in sample["input"]:
            sample["token anchors"].append({"from": offset, "to": offset + len(token)})
            offset += len(token) + 1

    return data
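

# Hedged usage sketch (this demo and its sample line are invented for
# illustration, not part of the original pipeline): each input line is one JSON
# graph with at least "id" and "input"; loading fills in tokens and spans.
def _demo_load_dataset():
    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False, encoding="utf8") as f:
        f.write(json.dumps({"id": "0", "input": "John sleeps"}) + "\n")
        path = f.name

    data = load_dataset(path)
    assert data["0"]["input"] == ["John", "sleeps"]
    # "John" spans characters [0, 4), "sleeps" spans [5, 11); the space at 4 is skipped.
    assert data["0"]["token anchors"] == [{"from": 0, "to": 4}, {"from": 5, "to": 11}]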
def node_generator(data):
    # Yield every node together with the sentence (graph) it belongs to.
    for d in data.values():
        for n in d["nodes"]:
            yield n, d
def anchor_ids_from_intervals(data):
    # Convert character-level anchors into token indices: a node references
    # every token whose character span overlaps one of its anchor intervals.
    for node, sentence in node_generator(data):
        if "anchors" not in node:
            node["anchors"] = []
        node["anchors"] = sorted(node["anchors"], key=lambda a: (a["from"], a["to"]))
        node["token references"] = set()

        for anchor in node["anchors"]:
            for i, token_anchor in enumerate(sentence["token anchors"]):
                if token_anchor["to"] <= anchor["from"]:
                    continue  # token ends before the anchor starts
                if token_anchor["from"] >= anchor["to"]:
                    break  # token starts after the anchor ends, so all later tokens do too

                node["token references"].add(i)

        # Keep the original character intervals and replace "anchors" with
        # the sorted token indices.
        node["anchor intervals"] = node["anchors"]
        node["anchors"] = sorted(node["token references"])
        del node["token references"]

    for sentence in data.values():
        sentence["token anchors"] = [[a["from"], a["to"]] for a in sentence["token anchors"]]
def create_bert_tokens(data, encoder: str):
    # Subtokenize every sentence with the encoder's tokenizer and keep the
    # mapping needed to scatter subword embeddings back onto tokens.
    tokenizer = AutoTokenizer.from_pretrained(encoder, use_fast=True)
    for sentence in data.values():
        sentence["bert input"], sentence["to scatter"] = subtokenize(sentence["input"], tokenizer)
def create_edges(sentence, label_f=None):
    # Store edges sparsely as [num_nodes, num_nodes, entries]: "edge presence"
    # holds (source, target, 1) triples, "edge labels" holds (source, target, label).
    N = len(sentence["nodes"])
    sentence["edge presence"] = [N, N, []]
    sentence["edge labels"] = [N, N, []]

    for e in sentence["edges"]:
        source, target = e["source"], e["target"]
        label = e.get("label", "none")
        if label_f is not None:
            label = label_f(label)

        sentence["edge presence"][-1].append((source, target, 1))
        sentence["edge labels"][-1].append((source, target, label))

    return len(sentence["edge presence"][-1])
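

# Illustrative check of the sparse edge format (invented data): one labeled
# edge from node 0 to node 1 in a two-node graph, with an optional label
# normalizer passed as label_f.
def _demo_create_edges():
    sentence = {
        "nodes": [{}, {}],
        "edges": [{"source": 0, "target": 1, "label": "ARG0"}],
    }
    n_edges = create_edges(sentence, label_f=str.lower)
    assert n_edges == 1
    assert sentence["edge presence"] == [2, 2, [(0, 1, 1)]]
    assert sentence["edge labels"] == [2, 2, [(0, 1, "arg0")]]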