syntax = "proto2";

package edu.stanford.nlp.pipeline;

option java_package = "edu.stanford.nlp.pipeline";
option java_outer_classname = "CoreNLPProtos";

//
// From JAVANLP_HOME, you can build me with the command:
//
//  protoc -I=src/edu/stanford/nlp/pipeline/ --java_out=src src/edu/stanford/nlp/pipeline/CoreNLP.proto
//

//
// To do the python version:
//
//  protoc -I=./doc --python_out=./stanza/protobuf ./doc/CoreNLP.proto
//

//
// An enumeration for the valid languages allowed in CoreNLP
//
enum Language {
  Unknown          = 0;
  Any              = 1;
  Arabic           = 2;
  Chinese          = 3;
  English          = 4;
  German           = 5;
  French           = 6;
  Hebrew           = 7;
  Spanish          = 8;
  UniversalEnglish = 9;
  UniversalChinese = 10;
}

//
// A document; that is, the equivalent of an Annotation.
//
message Document {
  required string     text       = 1;
  repeated Sentence   sentence   = 2;
  repeated CorefChain corefChain = 3;
  optional string     docID      = 4;
  optional string     docDate    = 7;
  optional uint64     calendar   = 8;

  /**
   * A peculiar field, for the corner case when a Document is
   * serialized without any sentences.
   */
  repeated Token sentencelessToken = 5;
  repeated Token character         = 10;

  repeated Quote quote = 6;

  /**
   * This field is for entity mentions across the document.
   */
  repeated NERMention mentions = 9;
  // used to differentiate between null and empty list
  optional bool hasEntityMentionsAnnotation = 13;

  /**
   * xml information
   */
  optional bool    xmlDoc   = 11;
  repeated Section sections = 12;

  /** coref mentions for entire document **/
  repeated Mention mentionsForCoref          = 14;
  optional bool    hasCorefMentionAnnotation = 15;
  optional bool    hasCorefAnnotation        = 16;
  repeated int32   corefMentionToEntityMentionMappings = 17;
  repeated int32   entityMentionToCorefMentionMappings = 18;

  // Field numbers 100 through 255 are set aside for proto2 extensions.
  extensions 100 to 255;
}

//
// The serialized version of a CoreMap representing a sentence.
//
message Sentence {
  repeated Token     token                = 1;
  required uint32    tokenOffsetBegin     = 2;
  required uint32    tokenOffsetEnd       = 3;
  optional uint32    sentenceIndex        = 4;
  optional uint32    characterOffsetBegin = 5;
  optional uint32    characterOffsetEnd   = 6;
  optional ParseTree parseTree            = 7;
  optional ParseTree binarizedParseTree   = 31;
  optional ParseTree annotatedParseTree   = 32;
  optional string    sentiment            = 33;
  repeated ParseTree kBestParseTrees      = 34;

  optional DependencyGraph basicDependencies                = 8;
  optional DependencyGraph collapsedDependencies            = 9;
  optional DependencyGraph collapsedCCProcessedDependencies = 10;
  optional DependencyGraph alternativeDependencies          = 13;

  repeated RelationTriple   openieTriple     = 14;  // The OpenIE triples in the sentence
  repeated RelationTriple   kbpTriple        = 16;  // The KBP triples in this sentence
  repeated SentenceFragment entailedSentence = 15;  // The entailed sentences, by natural logic
  repeated SentenceFragment entailedClause   = 35;  // The entailed clauses, by natural logic

  optional DependencyGraph enhancedDependencies         = 17;
  optional DependencyGraph enhancedPlusPlusDependencies = 18;

  repeated Token character = 19;

  optional uint32 paragraph  = 11;
  optional string text       = 12;  // Only needed if we're only saving the sentence.
  optional uint32 lineNumber = 20;

  // Fields set by other annotators in CoreNLP
  optional bool       hasRelationAnnotations       = 51;
  repeated Entity     entity                       = 52;
  repeated Relation   relation                     = 53;
  optional bool       hasNumerizedTokensAnnotation = 54;
  repeated NERMention mentions                     = 55;
  repeated Mention    mentionsForCoref             = 56;
  optional bool       hasCorefMentionsAnnotation   = 57;

  optional string sentenceID    = 58;  // Useful when storing sentences (e.g. ForEach)
  optional string sectionDate   = 59;  // date of section
  optional uint32 sectionIndex  = 60;  // section index for this sentence's section
  optional string sectionName   = 61;  // name of section
  optional string sectionAuthor = 62;  // author of section
  optional string docID         = 63;  // doc id
  optional bool   sectionQuoted = 64;  // is this sentence in an xml quote in a post

  optional bool hasEntityMentionsAnnotation = 65;  // check if there are entity mentions
  optional bool hasKBPTriplesAnnotation     = 68;  // check if there are KBP triples
  optional bool hasOpenieTriplesAnnotation  = 69;  // check if there are OpenIE triples

  // quote stuff
  optional uint32 chapterIndex   = 66;
  optional uint32 paragraphIndex = 67;
  // the quote annotator can sometimes add merged sentences
  optional Sentence enhancedSentence = 70;

  // speaker stuff
  optional string speaker     = 71;  // The speaker speaking this sentence
  optional string speakerType = 72;  // The type of speaker speaking this sentence

  // Field numbers 100 through 255 are set aside for proto2 extensions.
  extensions 100 to 255;
}

//
// The serialized version of a Token (a CoreLabel).
//
message Token {
  // Fields set by the default annotators [new CoreNLP(new Properties())]
  optional string word           = 1;   // the word's gloss (post-tokenization)
  optional string pos            = 2;   // The word's part of speech tag
  optional string value          = 3;   // The word's 'value', (e.g., parse tree node)
  optional string category       = 4;   // The word's 'category' (e.g., parse tree node)
  optional string before         = 5;   // The whitespace/xml before the token
  optional string after          = 6;   // The whitespace/xml after the token
  optional string originalText   = 7;   // The original text for this token
  optional string ner            = 8;   // The word's NER tag
  optional string coarseNER      = 62;  // The word's coarse NER tag
  optional string fineGrainedNER = 63;  // The word's fine-grained NER tag
  repeated string nerLabelProbs  = 66;  // listing of probs
  optional string normalizedNER  = 9;   // The word's normalized NER tag
  optional string lemma          = 10;  // The word's lemma
  optional uint32 beginChar      = 11;  // The character offset begin, in the document
  optional uint32 endChar        = 12;  // The character offset end, in the document
  optional uint32 utterance      = 13;  // The utterance tag used in dcoref
  optional string speaker        = 14;  // The speaker speaking this word
  optional string speakerType    = 77;  // The type of speaker speaking this word
  optional uint32 beginIndex     = 15;  // The begin index of, e.g., a span
  optional uint32 endIndex       = 16;  // The end index of, e.g., a span
  optional uint32 tokenBeginIndex = 17; // The begin index of the token
  optional uint32 tokenEndIndex  = 18;  // The end index of the token
  optional Timex  timexValue     = 19;  // The time this word refers to
  optional bool   hasXmlContext  = 21;  // Used by clean xml annotator
  repeated string xmlContext     = 22;  // Used by clean xml annotator
  optional uint32 corefClusterID = 23;  // The [primary] cluster id for this token
  optional string answer         = 24;  // A temporary annotation which is occasionally left in
  // optional string projectedCategory = 25;  // The syntactic category of the maximal constituent headed by the word. Not used anywhere, so deleted.
  optional uint32 headWordIndex  = 26;  // The index of the head word of this word.
  // If this is an operator, which one is it and what is its scope
  // (as per Natural Logic)?
  optional Operator operator     = 27;
  optional Polarity polarity     = 28;  // The polarity of this word, according to Natural Logic
  optional string polarity_dir   = 39;  // The polarity of this word, either "up", "down", or "flat"
  optional Span   span           = 29;  // The span of a leaf node of a tree
  optional string sentiment      = 30;  // The final sentiment of the sentence
  optional int32  quotationIndex = 31;  // The index of the quotation this token refers to
  optional MapStringString conllUFeatures = 32;
  optional string coarseTag      = 33;  // The coarse POS tag (used to store the UPOS tag)
  optional Span   conllUTokenSpan = 34;
  optional string conllUMisc     = 35;
  optional MapStringString conllUSecondaryDeps = 36;
  optional string wikipediaEntity = 37;
  optional bool   isNewline      = 38;

  // Fields set by other annotators in CoreNLP
  optional string gender       = 51;  // gender annotation (machine reading)
  optional string trueCase     = 52;  // true case type of token
  optional string trueCaseText = 53;  // true case gloss of token

  // Chinese character info
  optional string chineseChar    = 54;
  optional string chineseSeg     = 55;
  optional string chineseXMLChar = 60;

  // Arabic character info
  optional string arabicSeg = 76;

  // Section info
  optional string sectionName     = 56;
  optional string sectionAuthor   = 57;
  optional string sectionDate     = 58;
  optional string sectionEndLabel = 59;

  // French tokens have parents
  optional string parent = 61;

  // mention index info
  repeated uint32 corefMentionIndex  = 64;
  optional uint32 entityMentionIndex = 65;

  // mwt stuff
  optional bool   isMWT      = 67;
  optional bool   isFirstMWT = 68;
  optional string mwtText    = 69;
  // setting this to a map might be nice, but there are a couple issues
  // for one, there can be values with no key
  // for another, it's a pain to correctly parse, since different treebanks
  // can have different standards for how to write out the misc field
  optional string mwtMisc    = 78;

  // number info
  optional uint64 numericValue          = 70;
  optional string numericType           = 71;
  optional uint64 numericCompositeValue = 72;
  optional string numericCompositeType  = 73;

  optional uint32 codepointOffsetBegin = 74;
  optional uint32 codepointOffsetEnd   = 75;

  // Fields in the CoreLabel java class that are moved elsewhere
  // string text            @see Document#text + character offsets
  // uint32 sentenceIndex   @see Sentence#sentenceIndex
  // string docID           @see Document#docID
  // uint32 paragraph       @see Sentence#paragraph

  // Most serialized annotations will not have this
  // Some code paths may not correctly process this if serialized,
  // since many places will read the index off the position in a sentence
  // In particular, deserializing a Document using ProtobufAnnotationSerializer
  // will clobber any index value
  // But Semgrex and Ssurgeon in particular need a way
  // to pass around nodes where the node's index is not strictly 1, 2, 3, ...
  // thanks to the empty nodes in UD treebanks such as
  // English EWT or Estonian EWT (not related to each other)
  optional uint32 index      = 79;
  optional uint32 emptyIndex = 80;

  // Field numbers 100 through 255 are set aside for proto2 extensions.
  extensions 100 to 255;
}

//
// An enumeration of valid sentiment values for the sentiment classifier.
//
enum Sentiment {
  STRONG_NEGATIVE = 0;
  WEAK_NEGATIVE   = 1;
  NEUTRAL         = 2;
  WEAK_POSITIVE   = 3;
  STRONG_POSITIVE = 4;
}

//
// A quotation marker in text
//
message Quote {
  optional string text          = 1;
  optional uint32 begin         = 2;
  optional uint32 end           = 3;
  // NOTE: field number 4 is not used by any field of this message.
  optional uint32 sentenceBegin = 5;
  optional uint32 sentenceEnd   = 6;
  optional uint32 tokenBegin    = 7;
  optional uint32 tokenEnd      = 8;
  optional string docid         = 9;
  optional uint32 index         = 10;
  optional string author        = 11;
  optional string mention       = 12;
  optional uint32 mentionBegin  = 13;
  optional uint32 mentionEnd    = 14;
  optional string mentionType   = 15;
  optional string mentionSieve  = 16;
  optional string speaker       = 17;
  optional string speakerSieve  = 18;
  optional string canonicalMention      = 19;
  optional uint32 canonicalMentionBegin = 20;
  optional uint32 canonicalMentionEnd   = 21;
  optional DependencyGraph attributionDependencyGraph = 22;
}

//
// A syntactic parse tree, with scores.
//
message ParseTree {
  // Child subtrees; the message nests recursively.
  repeated ParseTree child           = 1;
  optional string    value           = 2;
  optional uint32    yieldBeginIndex = 3;
  optional uint32    yieldEndIndex   = 4;
  optional double    score           = 5;
  optional Sentiment sentiment       = 6;
}

//
// A dependency graph representation.
//
message DependencyGraph {
  // A node of the graph.
  message Node {
    required uint32 sentenceIndex  = 1;
    required uint32 index          = 2;
    optional uint32 copyAnnotation = 3;
    optional uint32 emptyIndex     = 4;
  }

  // An edge of the graph, running from `source` to `target`.
  message Edge {
    required uint32   source      = 1;
    required uint32   target      = 2;
    optional string   dep         = 3;
    optional bool     isExtra     = 4;
    optional uint32   sourceCopy  = 5;
    optional uint32   targetCopy  = 6;
    optional uint32   sourceEmpty = 8;
    optional uint32   targetEmpty = 9;
    optional Language language    = 7 [default=Unknown];
  }

  repeated Node   node = 1;
  repeated Edge   edge = 2;
  repeated uint32 root = 3 [packed=true];

  // optional: if this graph message is not part of a larger context,
  // the tokens will help reconstruct the actual sentence
  repeated Token token = 4;

  // The values in this field will index directly into the node list
  // This is useful so that additional information such as emptyIndex
  // can be considered without having to pass it around a second time
  repeated uint32 rootNode = 5 [packed=true];
}

//
// A coreference chain.
// These fields are not *really* optional. CoreNLP will crash without them.
//
message CorefChain {
  // One mention belonging to the chain.
  message CorefMention {
    optional int32  mentionID     = 1;
    optional string mentionType   = 2;
    optional string number        = 3;
    optional string gender        = 4;
    optional string animacy       = 5;
    optional uint32 beginIndex    = 6;
    optional uint32 endIndex      = 7;
    // NOTE: field number 8 is not used by any field of this message.
    optional uint32 headIndex     = 9;
    optional uint32 sentenceIndex = 10;
    optional uint32 position      = 11;  // the second element of position
  }

  required int32        chainID        = 1;
  repeated CorefMention mention        = 2;
  required uint32       representative = 3;
}

//
// a mention
//
message Mention {
  optional int32  mentionID           = 1;
  optional string mentionType         = 2;
  optional string number              = 3;
  optional string gender              = 4;
  optional string animacy             = 5;
  optional string person              = 6;
  optional uint32 startIndex          = 7;
  // NOTE: field number 8 is not used by any field of this message.
  optional uint32 endIndex            = 9;
  optional int32  headIndex           = 10;
  optional string headString          = 11;
  optional string nerString           = 12;
  optional int32  originalRef         = 13;
  optional int32  goldCorefClusterID  = 14;
  optional int32  corefClusterID      = 15;
  optional int32  mentionNum          = 16;
  optional int32  sentNum             = 17;
  optional int32  utter               = 18;
  optional int32  paragraph           = 19;
  optional bool   isSubject           = 20;
  optional bool   isDirectObject      = 21;
  optional bool   isIndirectObject    = 22;
  optional bool   isPrepositionObject = 23;
  optional bool   hasTwin             = 24;
  optional bool   generic             = 25;
  optional bool   isSingleton         = 26;
  optional bool   hasBasicDependency    = 27;
  optional bool   hasEnhancedDependency = 28;
  optional bool   hasContextParseTree   = 29;
  optional IndexedWord headIndexedWord = 30;
  optional IndexedWord dependingVerb   = 31;
  optional IndexedWord headWord        = 32;
  optional SpeakerInfo speakerInfo     = 33;

  repeated IndexedWord sentenceWords        = 50;
  repeated IndexedWord originalSpan         = 51;
  repeated string      dependents           = 52;
  repeated string      preprocessedTerms    = 53;
  repeated int32       appositions          = 54;
  repeated int32       predicateNominatives = 55;
  repeated int32       relativePronouns     = 56;
  repeated int32       listMembers          = 57;
  repeated int32       belongToLists        = 58;
}

//
// store the position (sentence, token index) of a CoreLabel
//
message IndexedWord {
  optional int32  sentenceNum = 1;
  optional int32  tokenIndex  = 2;
  optional int32  docID       = 3;
  optional uint32 copyCount   = 4;
}

//
// speaker info, this is used for Mentions
//
message SpeakerInfo {
  optional string speakerName = 1;
  repeated int32  mentions    = 2;
}

//
// A Span of text
//
message Span {
  required uint32 begin = 1;
  required uint32 end   = 2;
}

//
// A Timex object, representing a temporal expression (TIMe EXpression)
// These fields are not *really* optional. CoreNLP will crash without them.
//
message Timex {
  optional string value      = 1;
  optional string altValue   = 2;
  optional string text       = 3;
  optional string type       = 4;
  optional string tid        = 5;
  optional uint32 beginPoint = 6;
  optional uint32 endPoint   = 7;
}

//
// A representation of an entity in a relation.
// This corresponds to the EntityMention, and more broadly the
// ExtractionObject classes.
//
message Entity {
  optional uint32 headStart      = 6;
  optional uint32 headEnd        = 7;
  optional string mentionType    = 8;
  optional string normalizedName = 9;
  optional uint32 headTokenIndex = 10;
  optional string corefID        = 11;
  // inherited from ExtractionObject
  optional string objectID    = 1;
  optional uint32 extentStart = 2;
  optional uint32 extentEnd   = 3;
  optional string type        = 4;
  optional string subtype     = 5;
  // Implicit
  // uint32 sentence       @see implicit in sentence
}

//
// A representation of a relation, mirroring RelationMention
//
message Relation {
  repeated string argName   = 6;
  repeated Entity arg       = 7;
  optional string signature = 8;
  // inherited from ExtractionObject
  optional string objectID    = 1;
  optional uint32 extentStart = 2;
  optional uint32 extentEnd   = 3;
  optional string type        = 4;
  optional string subtype     = 5;
  // Implicit
  // uint32 sentence       @see implicit in sentence
}

//
// A Natural Logic operator
//
message Operator {
  required string name                = 1;
  required int32  quantifierSpanBegin = 2;
  required int32  quantifierSpanEnd   = 3;
  required int32  subjectSpanBegin    = 4;
  required int32  subjectSpanEnd      = 5;
  required int32  objectSpanBegin     = 6;
  required int32  objectSpanEnd       = 7;
}

//
// The seven informative Natural Logic relations
//
enum NaturalLogicRelation {
  EQUIVALENCE        = 0;
  FORWARD_ENTAILMENT = 1;
  REVERSE_ENTAILMENT = 2;
  NEGATION           = 3;
  ALTERNATION        = 4;
  COVER              = 5;
  INDEPENDENCE       = 6;
}

//
// The polarity of a word, according to Natural Logic
//
message Polarity {
  required NaturalLogicRelation projectEquivalence       = 1;
  required NaturalLogicRelation projectForwardEntailment = 2;
  required NaturalLogicRelation projectReverseEntailment = 3;
  required NaturalLogicRelation projectNegation          = 4;
  required NaturalLogicRelation projectAlternation       = 5;
  required NaturalLogicRelation projectCover             = 6;
  required NaturalLogicRelation projectIndependence      = 7;
}

//
// An NER mention in the text
//
message NERMention {
  optional uint32 sentenceIndex                 = 1;
  required uint32 tokenStartInSentenceInclusive = 2;
  required uint32 tokenEndInSentenceExclusive   = 3;
  required string ner                           = 4;
  optional string normalizedNER                 = 5;
  optional string entityType                    = 6;
  optional Timex  timex                         = 7;
  optional string wikipediaEntity               = 8;
  optional string gender                        = 9;
  optional uint32 entityMentionIndex            = 10;
  optional uint32 canonicalEntityMentionIndex   = 11;
  optional string entityMentionText             = 12;
}

//
// An entailed sentence fragment.
// Created by the openie annotator.
//
message SentenceFragment {
  repeated uint32 tokenIndex   = 1;
  optional uint32 root         = 2;
  optional bool   assumedTruth = 3;
  optional double score        = 4;
}

//
// The index of a token in a document, including the sentence
// index and the offset.
//
message TokenLocation {
  optional uint32 sentenceIndex = 1;
  optional uint32 tokenIndex    = 2;
}

//
// An OpenIE relation triple.
// Created by the openie annotator.
//
message RelationTriple {
  optional string subject    = 1;  // The surface form of the subject
  optional string relation   = 2;  // The surface form of the relation (required)
  optional string object     = 3;  // The surface form of the object
  optional double confidence = 4;  // The [optional] confidence of the extraction
  repeated TokenLocation subjectTokens  = 13;  // The tokens comprising the subject of the triple
  repeated TokenLocation relationTokens = 14;  // The tokens comprising the relation of the triple
  repeated TokenLocation objectTokens   = 15;  // The tokens comprising the object of the triple
  optional DependencyGraph tree = 8;   // The dependency graph fragment for this triple
  optional bool istmod   = 9;   // If true, this expresses an implicit tmod relation
  optional bool prefixBe = 10;  // If true, this relation string is missing a 'be' prefix
  optional bool suffixBe = 11;  // If true, this relation string is missing a 'be' suffix
  optional bool suffixOf = 12;  // If true, this relation string is missing an 'of' suffix
}

//
// A map from strings to strings.
// Used, minimally, in the CoNLLU featurizer
//
message MapStringString {
  repeated string key   = 1;
  repeated string value = 2;
}

//
// A map from integers to strings.
// Used, minimally, in the CoNLLU featurizer
//
message MapIntString {
  repeated uint32 key   = 1;
  repeated string value = 2;
}

//
// Store section info
//
message Section {
  required uint32 charBegin       = 1;
  required uint32 charEnd         = 2;
  optional string author          = 3;
  repeated uint32 sentenceIndexes = 4;
  optional string datetime        = 5;
  repeated Quote  quotes          = 6;
  optional uint32 authorCharBegin = 7;
  optional uint32 authorCharEnd   = 8;
  required Token  xmlTag          = 9;
}

// A message for requesting a semgrex
// Each sentence stores information about the tokens making up the
// corresponding graph
// An alternative would have been to use the existing Document or
// Sentence classes, but the problem with that is it would be
// ambiguous which dependency object to use.
message SemgrexRequest {
  // One dependency graph to search, together with its tokens.
  message Dependencies {
    repeated Token           token = 1;
    required DependencyGraph graph = 2;
  }

  repeated string       semgrex = 1;
  repeated Dependencies query   = 2;
}

// The response from running a semgrex
// If you pass in M semgrex expressions and N dependency graphs,
// this returns MxN nested results.  Each SemgrexResult can match
// multiple times in one graph
//
// You may want to send multiple semgrexes per query because
// translating large numbers of dependency graphs to protobufs
// will be expensive, so doing several queries at once will save time
message SemgrexResponse {
  message NamedNode {
    required string name       = 1;
    required int32  matchIndex = 2;
  }

  message NamedRelation {
    required string name = 1;
    required string reln = 2;
  }

  message NamedEdge {
    required string name       = 1;
    required int32  source     = 2;
    required int32  target     = 3;
    optional string reln       = 4;
    optional bool   isExtra    = 5;
    optional uint32 sourceCopy = 6;
    optional uint32 targetCopy = 7;
  }

  message Match {
    required int32         matchIndex = 1;
    repeated NamedNode     node       = 2;
    repeated NamedRelation reln       = 3;
    repeated NamedEdge     edge       = 6;

    // when processing multiple dependency graphs at once,
    // which dependency graph this applies to
    // indexed from 0
    optional int32 graphIndex = 4;
    // index of the semgrex expression this match applies to
    // indexed from 0
    optional int32 semgrexIndex = 5;
  }

  message SemgrexResult {
    repeated Match match = 1;
  }

  message GraphResult {
    repeated SemgrexResult result = 1;
  }

  repeated GraphResult result = 1;
}

// A message for processing an Ssurgeon
// Each sentence stores information about the tokens making up the
// corresponding graph
// An alternative would have been to use the existing Document or
// Sentence classes, but the problem with that is it would be
// ambiguous which dependency object to use.  Another problem
// is that if the intent is to use multiple graphs from a
// Sentence, then edits to the nodes of one graph would show up
// in the nodes of the other graph (same backing CoreLabels)
// and the operations themselves may not have the intended effect.
// The Ssurgeon is composed of two pieces, the semgrex and the
// ssurgeon operations, along with some optional documentation.
message SsurgeonRequest {
  message Ssurgeon {
    optional string semgrex   = 1;
    repeated string operation = 2;
    optional string id        = 3;
    optional string notes     = 4;
    optional string language  = 5;
  }

  repeated Ssurgeon        ssurgeon = 1;
  repeated DependencyGraph graph    = 2;
}

message SsurgeonResponse {
  message SsurgeonResult {
    optional DependencyGraph graph   = 1;
    optional bool            changed = 2;
  }

  repeated SsurgeonResult result = 1;
}

// It's possible to send in a whole document, but we
// only care about the Sentences and Tokens
message TokensRegexRequest {
  required Document doc     = 1;
  repeated string   pattern = 2;
}

// The result will be a nested structure:
//   repeated PatternMatch, one for each pattern
//   each PatternMatch has a repeated Match,
//     which tells you which sentence matched and where
message TokensRegexResponse {
  message MatchLocation {
    optional string text  = 1;
    optional int32  begin = 2;
    optional int32  end   = 3;
  }

  message Match {
    required int32         sentence = 1;
    required MatchLocation match    = 2;
    repeated MatchLocation group    = 3;
  }

  message PatternMatch {
    repeated Match match = 1;
  }

  repeated PatternMatch match = 1;
}

// A protobuf which allows to pass in a document with basic
// dependencies to be converted to enhanced
message DependencyEnhancerRequest {
  required Document document = 1;

  // At most one member of this oneof is set at a time.
  oneof ref {
    Language language = 2;
    // The expected value of this is a regex which matches relative pronouns
    string relativePronouns = 3;
  }
}

// A version of ParseTree with a flattened structure so that deep trees
// don't exceed the protobuf stack depth
message FlattenedParseTree {
  message Node {
    // At most one member of this oneof is set at a time.
    oneof contents {
      bool   openNode  = 1;
      bool   closeNode = 2;
      string value     = 3;
    }

    optional double score = 4;
  }

  repeated Node nodes = 1;
}

// A protobuf for calling the java constituency parser evaluator from elsewhere
message EvaluateParserRequest {
  message ParseResult {
    required FlattenedParseTree gold = 1;
    // repeated so you can send in kbest parses, if your parser handles that
    // note that this already includes a score field
    repeated FlattenedParseTree predicted = 2;
  }

  repeated ParseResult treebank = 1;
}

message EvaluateParserResponse {
  required double f1      = 1;
  optional double kbestF1 = 2;
  // keep track of the individual tree F1 scores
  repeated double treeF1 = 3;
}

// A protobuf for running Tsurgeon operations on constituency trees
message TsurgeonRequest {
  message Operation {
    required string tregex   = 1;
    repeated string tsurgeon = 2;
  }

  repeated Operation          operations = 1;
  repeated FlattenedParseTree trees      = 2;
}

// The results of the Tsurgeon operation
message TsurgeonResponse {
  repeated FlattenedParseTree trees = 1;
}

// Sent in Morphology requests - a stream of sentences with tagged words
message MorphologyRequest {
  message TaggedWord {
    required string word = 1;
    optional string xpos = 2;
  }

  repeated TaggedWord words = 1;
}

// Sent back from the Morphology request - the words and their tags
message MorphologyResponse {
  message WordTagLemma {
    required string word  = 1;
    optional string xpos  = 2;
    required string lemma = 3;
  }

  repeated WordTagLemma words = 1;
}

// A request for converting constituency trees to dependency graphs
message DependencyConverterRequest {
  repeated FlattenedParseTree trees = 1;
}

// The result of using the CoreNLP dependency converter.
// One graph per tree
message DependencyConverterResponse {
  message DependencyConversion {
    required DependencyGraph    graph = 1;
    optional FlattenedParseTree tree  = 2;
  }

  repeated DependencyConversion conversions = 1;
}