Spaces:
Runtime error
Runtime error
| from operator import itemgetter; | |
| import os.path; | |
| import re; | |
| import xml.etree.ElementTree as ET; | |
| from graph import Graph; | |
def walk(id, node, parent, nodes, edges, ns):
    """Traverse a tree depth-first (pre-order), collecting nodes and edges.

    Every visited element is appended to ``nodes`` as a triple
    ``(identifier, order, element)``: the ``id`` attribute, the integer
    value of its ``ord`` sub-element (0 when absent), and the element
    itself.  When ``edges`` is not None, each visited element that has a
    parent and a ``functor`` sub-element additionally contributes a
    ``(parent, identifier, functor)`` triple.

    Parameters:
      id      identifier of the tree being decoded (only used for the
              error message)
      node    the element to visit
      parent  identifier of the parent node, or None for the topmost node
      nodes   list receiving the node triples
      edges   list receiving the edge triples, or None to skip edges
      ns      namespace prefix prepended to every tag name looked up

    Raises Exception when an element has no ``id`` attribute, or when a
    non-topmost element has no ``ord`` value.
    """
    identifier = node.get("id")
    order = node.findtext(ns + "ord")
    #
    # only the topmost node (the one without a parent) may lack an ‘ord’
    #
    if identifier is None or (order is None and parent is not None):
        raise Exception("treex.walk(): "
                        "missing ‘id’ or ‘ord’ values while decoding tree #{}; exit."
                        "".format(id))
    position = 0 if order is None else int(order)
    nodes.append((identifier, position, node))

    if edges is not None:
        functor = node.findtext(ns + "functor")
        if not (parent is None or functor is None):
            edges.append((parent, identifier, functor))

    children = node.find(ns + "children")
    if children is None:
        return
    members = [child for child in children if child.tag == ns + "LM"]
    if members:
        for member in members:
            walk(id, member, identifier, nodes, edges, ns)
    else:
        #
        # without any ‘LM’ members, the ‘children’ element itself is visited
        # as the one daughter node
        #
        walk(id, children, identifier, nodes, edges, ns)
def read(fp, text = None):
    """Decode a Treex (PML) document into a stream of ‘ptg’ graphs.

    The XML read from ``fp`` is parsed in full; for each ``LM`` member of
    its ``bundles`` element one graph is built from the zones whose
    ``language`` attribute is "en": the analytical tree supplies the
    surface tokens, which are anchored as character spans in the zone
    sentence, and the tectogrammatical tree supplies nodes, edges, and
    ``coref_gram`` links.

    Parameters:
      fp    file name or file object, handed to ``ET.parse()``
      text  not used by this reader

    Yields ``(graph, None)`` pairs, one per bundle.

    Raises Exception when an English zone has trees but lacks the
    ``a_tree`` or ``t_tree`` (or their top ``children`` element), or when a
    surface form cannot be located in the sentence.
    """
    ns = "{http://ufal.mff.cuni.cz/pdt/pml/}"

    #
    # _fix_me_
    # factor out the anchor()ing code into a reusable form. (oe; 4-apr-20)
    #
    n = None
    i = 0

    def skip():
        # advance the current offset past spaces and tabs
        nonlocal i
        while i < n and graph.input[i] in {" ", "\t"}:
            i += 1

    def scan(candidates):
        # length of the first candidate found at the current offset, if any;
        # candidates are tried in the order given
        for candidate in candidates:
            if graph.input.startswith(candidate, i):
                return len(candidate)
        return None

    def anchor(form):
        # locate .form. at the current offset and return its character span
        nonlocal i
        skip()
        m = None
        if graph.input.startswith(form, i):
            m = len(form)
        else:
            #
            # the replacements accumulate, hence use a fixed order (a tuple,
            # not a set) to make the outcome independent of string hashing
            #
            for old, new in (("‘", "`"), ("’", "'")):
                form = form.replace(old, new)
                if graph.input.startswith(form, i):
                    m = len(form)
                    break
        if not m:
            #
            # fall back to accepting any quote, dash, or ellipsis variant at
            # this position; within each group the longest candidate comes
            # first, so that "---" is not anchored as a mere "--".
            # NOTE(review): the em dash is listed twice, as in the original;
            # possibly one of them was meant to be a different dash; confirm.
            #
            m = scan(("``", "“", "\"")) or scan(("‘", "`")) \
                or scan(("''", "”", "\"")) or scan(("’", "'")) \
                or scan(("---", "--", "—", "—")) \
                or scan((". . .", "...", "…"))
        if m:
            span = {"from": i, "to": i + m}
            i += m
            skip()
            return span
        raise Exception("{}: failed to anchor |{}| in |{}| ({})"
                        "".format(graph.id, form, graph.input, i))

    tree = ET.parse(fp).getroot()
    bundles = tree.find(ns + "bundles")
    for item in bundles.findall(ns + "LM"):
        identifier = item.get("id")
        graph = Graph(identifier, flavor = 0, framework = "ptg")
        surface = []
        nodes = []
        edges = []
        #
        # reset per bundle, so that a bundle without an English zone (or
        # without trees) cannot pick up values from the preceding bundle
        #
        sentence = None
        top = None
        for zone in item.iter(ns + "zone"):
            if zone.get("language") != "en":
                continue
            sentence = zone.findtext(ns + "sentence")
            trees = zone.find(ns + "trees")
            if trees is None:
                continue
            atree = trees.find(ns + "a_tree")
            ttree = trees.find(ns + "t_tree")
            #
            # guard against a missing tree before descending into it, so that
            # the error below is raised, rather than an AttributeError
            #
            root = atree.find(ns + "children") if atree is not None else None
            top = ttree.find(ns + "children") if ttree is not None else None
            if root is None or top is None:
                raise Exception("treex.read(): "
                                "missing ‘a_tree’ or ‘t_tree’ values while decoding tree #{}; exit."
                                "".format(identifier))
            walk(identifier, root, None, surface, None, ns)
            walk(identifier, top, None, nodes, edges, ns)

        #
        # determine character-based anchors for all .surface. (analytical) tokens
        #
        anchoring = dict()
        if sentence is not None:
            graph.add_input(sentence)
            n = len(graph.input)
            i = 0
            for key, _, element in sorted(surface, key = itemgetter(1)):
                anchoring[key] = anchor(element.findtext(ns + "form"))

        #
        # now process tectogrammatical nodes in surface order (as indicated in the
        # annotations): map to consecutive numerical identifiers; retrieve anchors
        # from corresponding analytical nodes; and create actual (new) graph nodes.
        #
        mapping = {}
        to = 0
        for key, _, element in sorted(nodes, key = itemgetter(1)):
            index = mapping[key] = len(mapping)
            properties = dict()
            a = element.find(ns + "a")
            if a is not None:
                anchors = []
                for lex in a:
                    if len(lex) == 0:
                        # a single reference, given as the element text
                        anchors.append(anchoring[lex.text])
                    else:
                        # a list of references, one per ‘LM’ member
                        for lm in lex.findall(ns + "LM"):
                            anchors.append(anchoring[lm.text])
                anchors = sorted(anchors, key = itemgetter("to"))
                to = anchors[-1]["to"]
            else:
                #
                # _fix_me_
                # discuss anchoring of generated nodes: currently, for uniformity, we
                # anchor them to an empty string immediately after the final character
                # of the preceding non-generated node.  but this arguably introduces a
                # vacuous piece of information, unless one were to argue that it rather
                # is an encoding of the node status for generated nodes? (oe; 4-apr-20)
                #
                anchors = [{"from": to, "to": to}]
            #
            # the node label comes from the tectogrammatical lemma
            #
            lemma = element.findtext(ns + "t_lemma")
            frame = element.findtext(ns + "val_frame.rf")
            #
            # where present (mostly on verbs), extract the valency frame identifier
            # _fix_me_
            # for compatibility with earlier PSD releases, strip prefix that seems to
            # identify the valency dictionary. (oe; 4-apr-20)
            #
            if frame is not None:
                _, separator, suffix = frame.partition("#")
                properties["frame"] = suffix if separator else frame
            #
            # selectively expose grammatemes as node-local properties, but ignore
            # (vanilla but very high-frequent) default values
            #
            grammatemes = element.find(ns + "gram")
            if grammatemes is not None:
                for property, default in [("tense", {"nil"}), ("negation", {"neg0"})]:
                    match = grammatemes.findtext(ns + property)
                    if match is not None and match not in default:
                        properties[property] = match
            graph.add_node(id = index, label = lemma, anchors = anchors,
                           properties = properties.keys(),
                           values = properties.values(),
                           top = key == top.get("id"))

        #
        # similarly, record all edges, now using mapped identifiers
        #
        for source, target, label in edges:
            graph.add_edge(mapping[source], mapping[target], label)

        #
        # in a second pass (so that all internal identifiers are mapped already),
        # create edges reflecting coreference annotations.
        #
        for key, _, element in nodes:
            coref = element.findtext(ns + "coref_gram.rf")
            if coref is not None:
                graph.add_edge(mapping[key], mapping[coref], "coref_gram")

        yield graph, None